From bbb86100df0cb9b7153bad3f27964422ec19ae0c Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Wed, 16 Jul 2025 11:09:18 -0400 Subject: [PATCH 01/61] rename script Signed-off-by: Burkhard Ringlein --- scripts/{offline_inference.py => offline_inference_llama.py} | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) rename scripts/{offline_inference.py => offline_inference_llama.py} (94%) diff --git a/scripts/offline_inference.py b/scripts/offline_inference_llama.py similarity index 94% rename from scripts/offline_inference.py rename to scripts/offline_inference_llama.py index d29412d70..82ba6d974 100644 --- a/scripts/offline_inference.py +++ b/scripts/offline_inference_llama.py @@ -41,8 +41,7 @@ from vllm.distributed import cleanup_dist_env_and_memory llm = LLM( - # model="/mnt/nvme5n1p1/zrlngl/fmaas/models/llama3.1-8b-instruct/", - model="/net/storage149/autofs/css22/nmg/models/hf/meta-llama/Llama-3.1-8B-Instruct/main/", + model="meta-llama/Llama-3.1-8B-Instruct", # max_model_len=2048, # enforce_eager=True, enable_prefix_caching=False, From d53287adffc8d4e558681b864cccc55040eeaa9e Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Wed, 16 Jul 2025 11:10:13 -0400 Subject: [PATCH 02/61] getting flashinfer installed in container Signed-off-by: Burkhard Ringlein --- Dockerfile | 74 ++++++++++++++++++++++++++++++++++++++++++++++++------ vllm | 2 +- 2 files changed, 68 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index 38274a7cc..5fb522a29 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,6 +3,7 @@ ARG BASE_UBI_IMAGE_TAG=9.4 ARG PYTHON_VERSION=3.12 ARG MAX_JOBS=64 ARG PIP_VLLM_VERSION=0.8.1 +# TODO add ARG CUDA_VERSION=12-8 ARG VLLM_SOURCE=pip # or VLLM_SOURCE=custom @@ -122,6 +123,31 @@ ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ python3 setup.py bdist_wheel --dist-dir=/workspace/ +# ## flashinfer Builder ################################################################# +# FROM vllm-builder_custom AS flashinfer-builder +# ARG MAX_JOBS +# +# # # build deps? +# # RUN --mount=type=cache,target=/root/.cache/pip \ +# # --mount=type=cache,target=/root/.cache/uv \ +# # uv pip install ninja cmake wheel pybind11 setuptools +# +# WORKDIR /workspace/flashinfer +# RUN git clone --recursive https://github.com/flashinfer-ai/flashinfer.git +# +# ENV TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX' +# ENV FLASHINFER_ENABLE_SM90=1 +# RUN --mount=type=cache,target=/root/.cache/pip \ +# cd flashinfer \ +# && export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} export FLASHINFER_ENABLE_SM90=${FLASHINFER_ENABLE_SM90} \ +# && python -m flashinfer.aot \ +# && python -m build --no-isolation --wheel +# +# # uv pip install \ +# # --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1" +# +# RUN ls -al /workspace/flashinfer/flashinfer/dist + ## Runtime ################################################################# FROM base AS runtime @@ -227,20 +253,54 @@ RUN --mount=type=cache,target=/root/.cache/pip \ uv pip install pytest llnl-hatchet debugpy # Install FlashInfer -RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ - echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment - -RUN --mount=type=cache,target=/root/.cache/pip \ - . 
/etc/environment && \ - python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl +# RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ +# echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment +# RUN --mount=type=cache,target=/root/.cache/pip \ +# . /etc/environment && \ +# python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl +# RUN --mount=type=cache,target=/root/.cache/pip \ +# . /etc/environment && \ +# uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl +# RUN --mount=type=cache,target=/root/.cache/pip \ +# uv pip install flashinfer-python -i https://flashinfer.ai/whl/cu124/torch2.6/ --no-deps +# RUN --mount=type=cache,target=/root/.cache/pip \ +# --mount=type=cache,target=/root/.cache/uv \ +# uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.5/flashinfer_python-0.2.5+cu124torch2.6-cp38-abi3-linux_x86_64.whl#sha256=43d767b912c0c43a04be99595e0123eab9385fc72530a2874b5fb08e3145c0be +# RUN --mount=type=cache,target=/root/.cache/pip \ +# --mount=type=cache,target=/root/.cache/uv \ +# uv pip install torch==2.7.0 +# RUN --mount=type=cache,target=/root/.cache/pip \ +# --mount=type=cache,target=/root/.cache/uv \ +# uv pip install https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl +# RUN mkdir /workspace/flashinfer_dist && ls -al /workspace/flashinfer_dist +# COPY --from=flashinfer-builder /workspace/*.whl /workspace/flashinfer_dist +# RUN --mount=type=cache,target=/root/.cache/pip \ +# --mount=type=cache,target=/root/.cache/uv \ +# uv pip install /workspace/flashinfer_dist/*.whl +# TODO: we need nvcc for flashinfer installation...custom build fails, see above +RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \ + https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo +RUN microdnf install -y \ + cuda-nvcc-12-8 cuda-nvtx-12-8 cuda-libraries-devel-12-8 && \ + microdnf clean all +ENV CUDA_HOME="/usr/local/cuda" \ + PATH="${CUDA_HOME}/bin:${PATH}" \ + LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}" +ENV TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX' +ENV FLASHINFER_ENABLE_SM90=1 +RUN TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} FLASHINFER_ENABLE_SM90=${FLASHINFER_ENABLE_SM90} uv pip install \ + --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1" + RUN ln -s ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_cupti/lib/libcupti.so.12 ${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_cupti/lib/libcupti.so RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=cache,target=/root/.cache/uv \ git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness && cd lm-evaluation-harness && uv pip install . 
-RUN git clone --depth 1 https://github.com/IBM/fmwork.git +# RUN git clone --depth 1 https://github.com/IBM/fmwork.git +# RUN git clone --depth 1 https://github.com/IBM/fmwork.git +COPY fmwork fmwork ENV STORE_TEST_RESULT_PATH=/results diff --git a/vllm b/vllm index d91278181..a9019b760 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit d91278181d89686b73b2ec88c2db4d55c6c506cb +Subproject commit a9019b7608d258db59651bcf7678c0e291a6d4ce From 720dc2a85b290236f6a57cecb0905bf0ab60f44d Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Wed, 16 Jul 2025 11:10:21 -0400 Subject: [PATCH 03/61] getting offline script to run Signed-off-by: Burkhard Ringlein --- scripts/offline_inference_llama.py | 3 +- scripts/offline_inference_mamba.py | 97 ++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 scripts/offline_inference_mamba.py diff --git a/scripts/offline_inference_llama.py b/scripts/offline_inference_llama.py index 82ba6d974..3c213e6c9 100644 --- a/scripts/offline_inference_llama.py +++ b/scripts/offline_inference_llama.py @@ -41,7 +41,8 @@ from vllm.distributed import cleanup_dist_env_and_memory llm = LLM( - model="meta-llama/Llama-3.1-8B-Instruct", + # model="meta-llama/Llama-3.1-8B-Instruct", + model=f"{os.environ["MY_MODEL_PATH"]}", # max_model_len=2048, # enforce_eager=True, enable_prefix_caching=False, diff --git a/scripts/offline_inference_mamba.py b/scripts/offline_inference_mamba.py new file mode 100644 index 000000000..151b0bf84 --- /dev/null +++ b/scripts/offline_inference_mamba.py @@ -0,0 +1,97 @@ +# /******************************************************************************* +# * Copyright 2025 IBM Corporation +# * +# * Licensed under the Apache License, Version 2.0 (the "License"); +# * you may not use this file except in compliance with the License. +# * You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. 
+# *******************************************************************************/ +# + + +import os +import time + +# to enable debug printing +# os.environ["TRITON_BACKEND_DEBUG"] = "1" + +# to use triton_attn backend +os.environ["VLLM_USE_V1"] = "1" +os.environ["VLLM_PLUGINS"] = "" +# os.environ["VLLM_ATTENTION_BACKEND"] = "TRITON_ATTN_VLLM_V1" +os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER" +# os.environ["VLLM_TRITON_ENABLE_JITCACHE"] = "1" +os.environ["VLLM_TRITON_ENABLE_JITCACHE"] = "0" + +# enable torch profiler, can also be set on cmd line +enable_profiling = True +# enable_profiling = False + +if enable_profiling: + os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_torch_profile_mamba" + + +if __name__ == "__main__": + from vllm import LLM, SamplingParams + from vllm.distributed import cleanup_dist_env_and_memory + + llm = LLM( + model=f"{os.environ["MY_MODEL_PATH"]}", + # enforce_eager=True, + enable_chunked_prefill=True, + enable_prefix_caching=False, + tensor_parallel_size=2, + max_model_len=31628, + max_num_seqs=512, + num_scheduler_steps=1, + ) + + # batch_size = 32 + max_tokens = 20 + + sampling_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) + # ignore_eos=True) + + prompts = [ + "Zurich is a beautiful city with", + # "San Francisco is a large city with", + # "Provide a list of instructions for preparing chicken soup for a family " + # "of four.", + # "Skating and cross country skiing technique differ in", + ] + + print( + f"SETUP: vllm backend: {os.environ['VLLM_ATTENTION_BACKEND']} " + f" JITCache: {os.environ['VLLM_TRITON_ENABLE_JITCACHE']} " + ) + print(f"Inference with {len(prompts)} prompts...") + if enable_profiling: + llm.start_profile() + t0 = time.time() + # outputs = llm.generate(prompts, sampling_params) + outputs = [] + for prompt in prompts: + outputs.append(llm.generate(prompt, sampling_params)) + + if enable_profiling: + llm.stop_profile() + t1 = time.time() + + print(f"inference time: {t1-t0:.5f}s") + + for output in outputs: + output = output[0] # in case of loop above + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + # Add a buffer to wait for profiler in the background process + # (in case MP is on) to finish writing profiling output. 
+ time.sleep(10) From 7e8c19c49492e13e169fe3501b14b59cc09f24fb Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Thu, 17 Jul 2025 09:25:40 -0400 Subject: [PATCH 04/61] triton dejavu for granite 4 Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 26 ++ .../default/cache.json | 25 ++ .../default/cache.json | 31 ++ .../default/cache.json | 26 ++ .../default/cache.json | 30 ++ .../default/cache.json | 28 ++ .../default/cache.json | 8 + .../default/cache.json | 8 + .../default/cache.json | 347 ++++++++++++++++++ scripts/offline_inference_mamba.py | 4 +- vllm | 2 +- 11 files changed, 532 insertions(+), 3 deletions(-) create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_cumsum_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-e1d63b4ce9f3ae5e2f38b68d3d8257474338c0a672ac24128b374d342459d7e1/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json create mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-68916ac9231d70c9dfa4b1081268470f5b25a8dbabb73d3818ba7e74c7fdc03c/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-c01d6c3dfb6d587c5fb5a1edbe6d606a9804204c3305d997bb82640bf3e80282/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json new file mode 100755 index 000000000..0225f79be --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json @@ -0,0 +1,26 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_bmm:_bmm_chunk_fwd_kernel)", + 
"total_bench_time_s": 10.309182405471802, + "evaluated_configs": 9, + "keys": [ + "chunk_size", + "K", + "IS_CAUSAL" + ], + "cache": { + "('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 128, BLOCK_SIZE_K: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": [ + 0.04188799858093262 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": false + } +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_cumsum_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_cumsum_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default/cache.json new file mode 100755 index 000000000..5b20369a8 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_cumsum_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default/cache.json @@ -0,0 +1,25 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_cumsum_fwd_kernel)", + "total_bench_time_s": 8.378965139389038, + "evaluated_configs": 7, + "keys": [ + "chunk_size", + "nheads" + ], + "cache": { + "('256', '64', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": "BLOCK_SIZE_H: 32, num_warps: 4, num_ctas: 1, num_stages: 3, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('256', '64', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": [ + 0.05206400156021118 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": false + } +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-e1d63b4ce9f3ae5e2f38b68d3d8257474338c0a672ac24128b374d342459d7e1/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-e1d63b4ce9f3ae5e2f38b68d3d8257474338c0a672ac24128b374d342459d7e1/default/cache.json new file mode 100755 index 000000000..14c211cf5 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-e1d63b4ce9f3ae5e2f38b68d3d8257474338c0a672ac24128b374d342459d7e1/default/cache.json @@ -0,0 +1,31 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_scan:_chunk_scan_fwd_kernel)", + "total_bench_time_s": 36.24500060081482, + "evaluated_configs": 11, + "keys": [ + "chunk_size", + "hdim", + "dstate", + "IS_CAUSAL" + ], + "cache": { + "('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 256, BLOCK_SIZE_K: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": [ + 0.20547200739383698 + ], + "('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32')": [ + 0.6873279809951782 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": false + } +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json new file mode 100755 index 000000000..2aeb42c51 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json @@ -0,0 +1,26 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_fwd_kernel)", + "total_bench_time_s": 10.325033903121948, + "evaluated_configs": 9, + "keys": [ + "hdim", + "dstate", + "chunk_size" + ], + "cache": { + "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 128, BLOCK_SIZE_K: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": [ + 0.08188799768686295 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": false + } +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json new file mode 100755 index 000000000..3b86e0dae --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json @@ -0,0 +1,30 @@ +{ + "signature": 
"JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_varlen_kernel)", + "total_bench_time_s": 23.77578854560852, + "evaluated_configs": 9, + "keys": [ + "hdim", + "dstate", + "chunk_size" + ], + "cache": { + "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 128, BLOCK_SIZE_K: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 32, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 5, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16')": [ + 0.09270399808883667 + ], + "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16')": [ + 0.01027199998497963 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": false + } +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-68916ac9231d70c9dfa4b1081268470f5b25a8dbabb73d3818ba7e74c7fdc03c/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-68916ac9231d70c9dfa4b1081268470f5b25a8dbabb73d3818ba7e74c7fdc03c/default/cache.json new file mode 100755 index 000000000..04198714b --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-68916ac9231d70c9dfa4b1081268470f5b25a8dbabb73d3818ba7e74c7fdc03c/default/cache.json @@ -0,0 +1,28 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_state_passing:_state_passing_fwd_kernel)", + "total_bench_time_s": 9.725267887115479, + "evaluated_configs": 6, + "keys": [ + "dim" + ], + "cache": { + "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32')": "BLOCK_SIZE: 512, num_warps: 4, num_ctas: 1, num_stages: 3, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('8192', 'torch.float32', 'torch.bfloat16', 
'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32')": "BLOCK_SIZE: 512, num_warps: 4, num_ctas: 1, num_stages: 3, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32')": [ + 0.059007998555898666 + ], + "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32')": [ + 0.08220800012350082 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": false + } +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json new file mode 100755 index 000000000..2540ac5c3 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-c01d6c3dfb6d587c5fb5a1edbe6d606a9804204c3305d997bb82640bf3e80282/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-c01d6c3dfb6d587c5fb5a1edbe6d606a9804204c3305d997bb82640bf3e80282/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json new file mode 100755 index 000000000..a7c2af725 --- /dev/null +++ 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-c01d6c3dfb6d587c5fb5a1edbe6d606a9804204c3305d997bb82640bf3e80282/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json new file mode 100755 index 000000000..04eb1f234 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json @@ -0,0 +1,347 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", + "total_bench_time_s": 34544.99443292618, + "evaluated_configs": 540, + "keys": [ + "MAX_SEQ_Q", + "MAX_SEQ_K", + "AVG_SEQ_Q", + "AVG_SEQ_K", + "num_query_heads", + "num_queries_per_kv", + "BLOCK_SIZE", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "SLIDING_WINDOW", + "stride_k_cache_3", + "stride_v_cache_3" + ], + "cache": { + "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, 
BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', 
'1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('16', '32', 
'8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('128', '128', 
'64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', 
'64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, 
reg_inc_consumer: 0, maxnreg: None", + "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.003466148627921939 + ], + "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.003575095208361745 + ], + "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.004993442911654711 + ], + "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.006109926383942366 + ], + "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.03988393768668175 + ], + "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.09943539649248123 + ], + "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.3283151388168335 + ], + "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 1.0377004146575928 + ], + "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0033776038326323032 + ], + "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.003488453570753336 + ], + "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0033901487477123737 + ], + "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0032401704229414463 + ], + "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.004394480027258396 + ], + "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.004883989226073027 + ], + "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0045789312571287155 + ], + "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.006259772460907698 + ], + "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.010929320007562637 + ], + "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.040549296885728836 + ], + "('1', '1024', 
'1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.02016238309442997 + ], + "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.1051921397447586 + ], + "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.03749670833349228 + ], + "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.3411431908607483 + ], + "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0701025053858757 + ], + "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 1.0497854948043823 + ], + "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0034944734070450068 + ], + "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0042336732149124146 + ], + "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.005933090578764677 + ], + "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.026846082881093025 + ], + "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.07565699517726898 + ], + "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.2685732841491699 + ], + "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.8566849827766418 + ], + "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.003527216147631407 + ], + "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.004583046771585941 + ], + "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0060236589051783085 + ], + "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.026979871094226837 + ], + "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.08126690983772278 + ], + "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.2932415306568146 + ], + "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.8659728765487671 + ], + "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.00306075531989336 + ], + "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0034781373105943203 + ], + "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.003616524860262871 + ], + "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0030675148591399193 + ], + "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0038118616212159395 + ], + "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.003134604310616851 + ], + "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0055700079537928104 + ], + "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.009849821217358112 + ], + "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.014783395454287529 + ], + "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.04928915575146675 + ], + "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.15255023539066315 + ], + "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.013137963600456715 + ], + "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.4398653507232666 + 
], + "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 1.4163719415664673 + ], + "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0033607585355639458 + ], + "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0038107747677713633 + ], + "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.004322108346968889 + ], + "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0033715730533003807 + ], + "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.004160675685852766 + ], + "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.004942106083035469 + ], + "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.00334966741502285 + ], + "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0050212424248456955 + ], + "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.007804282940924168 + ], + "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.007798833306878805 + ], + "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.014028973877429962 + ], + "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.03204701468348503 + ], + "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.08394649624824524 + ], + "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.08103202283382416 + ], + "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.23096241056919098 + ], + "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.006906270515173674 + ], + "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.23079754412174225 + ], + "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.7025490999221802 + ], + "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.6989444494247437 + ], + "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 2.3537752628326416 + ], + "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.004250869620591402 + ], + "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.005911743268370628 + ], + "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.011380953714251518 + ], + "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.05582933872938156 + ], + "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.16943588852882385 + ], + "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.4909878969192505 + ], + "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 1.5911381244659424 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } +} \ No newline at end of file diff --git a/scripts/offline_inference_mamba.py b/scripts/offline_inference_mamba.py index 151b0bf84..3a09ff639 100644 --- a/scripts/offline_inference_mamba.py +++ b/scripts/offline_inference_mamba.py @@ -44,7 +44,7 @@ llm = LLM( model=f"{os.environ["MY_MODEL_PATH"]}", - # enforce_eager=True, + enforce_eager=True, enable_chunked_prefill=True, enable_prefix_caching=False, 
tensor_parallel_size=2, @@ -61,7 +61,7 @@ prompts = [ "Zurich is a beautiful city with", - # "San Francisco is a large city with", + "San Francisco is a large city with", # "Provide a list of instructions for preparing chicken soup for a family " # "of four.", # "Skating and cross country skiing technique differ in", diff --git a/vllm b/vllm index a9019b760..3fd1aaf67 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit a9019b7608d258db59651bcf7678c0e291a6d4ce +Subproject commit 3fd1aaf670583ab00befd473b450ca2b96b92f23 From 02234769100657e1d37679aa02f686ad6fd98333 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Fri, 18 Jul 2025 12:45:13 +0200 Subject: [PATCH 05/61] adding cache to moe config script Signed-off-by: Burkhard Ringlein --- scripts/dejavu-to-moe_configs.py | 125 +++++++++++++++++++++++++++++++ vllm | 2 +- 2 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 scripts/dejavu-to-moe_configs.py diff --git a/scripts/dejavu-to-moe_configs.py b/scripts/dejavu-to-moe_configs.py new file mode 100644 index 000000000..93ba227ae --- /dev/null +++ b/scripts/dejavu-to-moe_configs.py @@ -0,0 +1,125 @@ + +import sys +import os +import json + +# __vllm_base_path__ = '/home/zrlngl/watsonx/vllm/vllm/model_executor/layers/fused_moe/configs/' +# __vllm_base_path__ = '/home/zrlngl/watsonx/vllm/ngl_configs/' +__vllm_base_path__ = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../ngl_configs/')) + +moe_keys = [ + 'N', + 'K', + 'E', + # 'EM', + 'num_valid_tokens', + 'num_actual_tokens', + 'stride_am', + 'stride_ak', + 'stride_be', + 'stride_bk', + 'stride_bn', + 'stride_cm', + 'stride_cn', + 'MUL_ROUTED_WEIGHT', + 'top_k', + 'compute_type', + 'use_fp8_w8a8', + 'use_int8_w8a16', + ] +__skip_config_args__ = ["enable_persistent", "maxnreg"] + + +def moe_key_to_param_dict(k): + kl = k[1:-1].split(', ') + ret = {} + for i, label in enumerate(moe_keys): + ret[label] = kl[i] + return ret + + +def create_config_dict(v): + # for vLLM specific + vlist = v.split(", ") + ret = {} + for e in vlist: + sl = e.split(": ") + if sl[0] in __skip_config_args__: + continue + ret[sl[0]] = int(sl[1]) + return ret + + +def translate_dejavu_cache(cache_path): + print(f"Exporting {cache_path} to {__vllm_base_path__}...") + # tag_path = os.path.dirname(cache_path) + # gpu_name_path = os.path.dirname(os.path.dirname(tag_path[:-2])[:-2]) + # gpu_name = os.path.basename(gpu_name_path)[4:] + # adapt to new structure + path_ids = os.path.abspath(cache_path).split('/') + gpu_name_path = path_ids[-7] + gpu_name = gpu_name_path[4:] + + with open(cache_path, 'r') as f: + dejavu_cache = json.load(f) + + cache_dict = dejavu_cache['cache'] + + # k0 = list(cache_dict.keys())[0] + # v0 = cache_dict[k0] + num_experts = None + + config_per_device = {} + timings_per_device = {} + for k, v in cache_dict.items(): + kd = moe_key_to_param_dict(k) + vd = create_config_dict(v) + ot = dejavu_cache['timings'][k]['values'][dejavu_cache['timings'][k]['lables'].index('ms')] + if num_experts is None: + num_experts = int(kd['E'][1:-1]) + else: + assert num_experts == int(kd['E'][1:-1]) + # num_tokens = int(kd['num_valid_tokens'][1:-1]) + # TODO: how to automatically determine /2? update method signature? 
+ # num_tokens = int(int(kd['num_valid_tokens'][1:-1]) / 2) + num_tokens = int(kd['num_actual_tokens'][1:-1]) + # N = int(kd['N'][1:-1])/num_tokens + # N = int(kd['N'][1:-1]) + # vllm_N = int(kd['stride_am'][1:-1]) + vllm_N = int(kd['K'][1:-1]) + # N = int(kd['stride_am'][1:-1])/2 # due to test script shape generation? + new_dict = {num_tokens: vd} + if vllm_N not in config_per_device: + config_per_device[vllm_N] = new_dict + timings_per_device[vllm_N] = {num_tokens: ot} + else: + # config_per_device[vllm_N].update(new_dict) + if num_tokens not in config_per_device[vllm_N]: + config_per_device[vllm_N][num_tokens] = vd + timings_per_device[vllm_N][num_tokens] = ot + else: + if ot >= timings_per_device[vllm_N][num_tokens]: + print(f"configuration for {num_tokens} already existent: {config_per_device[vllm_N][num_tokens]}; " + f"would overwrite with {vd} but is SLOWER, skipping...") + else: + print(f"configuration for {num_tokens} already existent: {config_per_device[vllm_N][num_tokens]}; " + f"overwrite with {vd} because it is FASTER...") + config_per_device[vllm_N][num_tokens] = vd + timings_per_device[vllm_N][num_tokens] = ot + + modified_paths = [] + for N, config_dict in config_per_device.items(): + file_name = f"E={int(num_experts)},N={int(N)},device_name={gpu_name}.json" + modified_paths.append(file_name) + target_path = os.path.abspath(f"{__vllm_base_path__}/{file_name}") + # num_tokens / M as key in dict + with open(target_path, 'w') as f: + json.dump(config_dict, f, indent=4) + + print(f"modified the following files: {modified_paths}") + print(f"triton-dejavu has saved {dejavu_cache['total_bench_time_s']}s") + print('...done') + + +if __name__ == '__main__': + translate_dejavu_cache(sys.argv[1]) diff --git a/vllm b/vllm index 3fd1aaf67..d91278181 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit 3fd1aaf670583ab00befd473b450ca2b96b92f23 +Subproject commit d91278181d89686b73b2ec88c2db4d55c6c506cb From 75f387713ffff3b9612b049657a713f67098d488 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Fri, 18 Jul 2025 10:50:50 +0000 Subject: [PATCH 06/61] moving fmwork to third_party Signed-off-by: Burkhard Ringlein --- .gitmodules | 3 +++ Dockerfile | 4 ++-- third_party/fmwork | 1 + third_party/{ => kernels}/vedantroy_paged_attention.py | 0 4 files changed, 6 insertions(+), 2 deletions(-) create mode 160000 third_party/fmwork rename third_party/{ => kernels}/vedantroy_paged_attention.py (100%) diff --git a/.gitmodules b/.gitmodules index 9880c2f9b..a3b1beaed 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "vllm"] path = vllm url = https://github.com/vllm-project/vllm.git +[submodule "third_party/fmwork"] + path = third_party/fmwork + url = https://github.com/IBM/fmwork.git diff --git a/Dockerfile b/Dockerfile index 5fb522a29..0e5fe148b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -300,7 +300,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ # RUN git clone --depth 1 https://github.com/IBM/fmwork.git # RUN git clone --depth 1 https://github.com/IBM/fmwork.git -COPY fmwork fmwork +COPY third_party/fmwork fmwork ENV STORE_TEST_RESULT_PATH=/results @@ -310,7 +310,7 @@ COPY vllm/tests tests COPY ShareGPT_V3_unfiltered_cleaned_split.json ShareGPT_V3_unfiltered_cleaned_split.json # Copy thid-party kernels and insert into path -COPY third_party third_party +COPY third_party/kernels third_party ENV PYTHONPATH /workspace # see https://github.com/IBM/triton-dejavu?tab=readme-ov-file#environment-variables diff --git a/third_party/fmwork b/third_party/fmwork new file 
mode 160000 index 000000000..846345f3c --- /dev/null +++ b/third_party/fmwork @@ -0,0 +1 @@ +Subproject commit 846345f3c5f8f0f42a7dbfbc297ed5cd66f09ece diff --git a/third_party/vedantroy_paged_attention.py b/third_party/kernels/vedantroy_paged_attention.py similarity index 100% rename from third_party/vedantroy_paged_attention.py rename to third_party/kernels/vedantroy_paged_attention.py From da56c2e3c5d190efd4bb1f72e0c981be1f80a274 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Fri, 18 Jul 2025 10:51:07 +0000 Subject: [PATCH 07/61] benchmark moe fallback heuristics Signed-off-by: Burkhard Ringlein --- scripts/high_qps_bench.sh | 25 +++++++++++++++++++++++++ vllm | 2 +- 2 files changed, 26 insertions(+), 1 deletion(-) create mode 100755 scripts/high_qps_bench.sh diff --git a/scripts/high_qps_bench.sh b/scripts/high_qps_bench.sh new file mode 100755 index 000000000..271b87d10 --- /dev/null +++ b/scripts/high_qps_bench.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# uv pip install pandas datasets numpy + +# MODEL=meta-llama/Llama-3.1-8B-Instruct +# MODEL=/net/storage149/autofs/css22/nmg/models/hf/ibm-granite/granite-4.0-tiny-preview/main/ +MODEL=/net/storage149/autofs/css22/nmg/models/hf/ibm-ai-platform/Bamba-9B-v1/main/ +REQUEST_RATES=(20 20 20) +TOTAL_SECONDS=120 + +for REQUEST_RATE in "${REQUEST_RATES[@]}"; +do + NUM_PROMPTS=$(($TOTAL_SECONDS * $REQUEST_RATE)) + echo "" + echo "===== RUNNING $MODEL FOR $NUM_PROMPTS PROMPTS WITH $REQUEST_RATE QPS =====" + echo "" + python3 vllm-triton-backend/vllm/benchmarks/benchmark_serving.py \ + --model $MODEL \ + --dataset-name random \ + --ignore-eos \ + --num-prompts $NUM_PROMPTS \ + --request-rate $REQUEST_RATE \ + --port 8803 \ + ; +done; diff --git a/vllm b/vllm index d91278181..5be130601 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit d91278181d89686b73b2ec88c2db4d55c6c506cb +Subproject commit 5be1306019f7622b86c8f8aedfea477be83d4a21 From fd8531c2fbf75df4b0fab44658c1ad44c64892df Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Fri, 18 Jul 2025 13:42:27 +0000 Subject: [PATCH 08/61] implemented fused moe benchmark Signed-off-by: Burkhard Ringlein --- .../ibm_triton_lib/kernels/__init__.py | 2 + .../default/cache.json | 8 + .../default/cache.json | 8 + .../ibm_triton_lib/kernels/fused_moe.py | 1814 +++++++++++++++++ scripts/benchmark.py | 231 ++- scripts/setups/granite4_moe_0.conf | 20 + 6 files changed, 2082 insertions(+), 1 deletion(-) create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json create mode 100644 ibm-triton-lib/ibm_triton_lib/kernels/fused_moe.py create mode 100644 scripts/setups/granite4_moe_0.conf 
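For reference, the per-device JSON files written by scripts/dejavu-to-moe_configs.py (patch 05 above) are keyed by token count M, and vLLM picks the entry tuned for the batch size closest to the one at hand, the same min(|key - M|) rule that appears in try_get_optimal_moe_config in the fused_moe.py added below. A minimal sketch of that lookup follows; the file name and the config values are illustrative only, not taken from a real tuning run.

import json

def load_moe_config(path: str) -> dict[int, dict]:
    # Config files look like E=<experts>,N=<size>,device_name=<gpu>.json and
    # map a token count M (stored as a string key) to a Triton kernel config.
    with open(path) as f:
        return {int(m): cfg for m, cfg in json.load(f).items()}

def pick_config(configs: dict[int, dict], M: int) -> dict:
    # Same rule as vLLM's try_get_optimal_moe_config: the nearest tuned M wins.
    return configs[min(configs.keys(), key=lambda x: abs(x - M))]

if __name__ == "__main__":
    # Illustrative contents of a hypothetical
    # E=64,N=768,device_name=NVIDIA_H100_80GB_HBM3.json
    example = {
        16: {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64,
             "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4},
        1024: {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32,
               "GROUP_SIZE_M": 8, "num_warps": 4, "num_stages": 4},
    }
    print(pick_config(example, M=200))  # closest key is 16
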
diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py b/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py index a78522fc2..2a97f6023 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py @@ -69,3 +69,5 @@ def ConfigSpace( from .triton_unified_attention import unified_attention from .mamba_ssm import selective_state_update + +from .fused_moe import fused_moe diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json new file mode 100755 index 000000000..c2b3452bf --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.mamba_ssm:_selective_scan_update_kernel)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json new file mode 100755 index 000000000..a7c2af725 --- /dev/null +++ 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/fused_moe.py b/ibm-triton-lib/ibm_triton_lib/kernels/fused_moe.py new file mode 100644 index 000000000..2276a8713 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/fused_moe.py @@ -0,0 +1,1814 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Fused MoE kernel.""" +import functools +import json +import os +from typing import Any, Callable, Optional + +import torch + +import vllm.envs as envs +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm import _custom_ops as ops +from vllm.logger import init_logger +# yapf: disable +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEQuantConfig, get_config_quant_dtype) +from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + _valid_cutlass_block_scaled_grouped_gemm, + run_cutlass_block_scaled_fused_experts) +# yapf: enable +from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( + _valid_deep_gemm, deep_gemm_moe_fp8) +from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( + moe_align_block_size) +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP) +from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( + TopKWeightAndReduceNoOP) +from vllm.model_executor.layers.fused_moe.utils import ( + _resize_cache, moe_kernel_quantize_input) +from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( + dequant_mxfp4) +from vllm.platforms import current_platform +from vllm.triton_utils import tl, triton +from vllm.utils import direct_register_custom_op +from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used + +# from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled + +logger = init_logger(__name__) + + +@triton.jit +def write_zeros_to_output(c_ptr, stride_cm, stride_cn, pid_n, N, offs_token, + token_mask, BLOCK_SIZE_M, BLOCK_SIZE_N, + compute_type): + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=compute_type) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[ + None, :] + c_mask = token_mask[:, None] & (offs_cn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + + +@triton.jit +def fused_moe_kernel_gptq_awq( + # Pointers to matrices + a_ptr, + b_ptr, + c_ptr, + b_scale_ptr, + b_zp_ptr, + topk_weights_ptr, + sorted_token_ids_ptr, + expert_ids_ptr, + num_tokens_post_padded_ptr, + # Matrix dimensions + N: tl.constexpr, + K: tl.constexpr, + EM, + num_valid_tokens, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. 
`stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). + stride_am, + stride_ak, + stride_be, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_bse, + stride_bsk, + stride_bsn, + stride_bze, + stride_bzk, + stride_bzn, + block_k_diviable: tl.constexpr, + group_size: tl.constexpr, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + MUL_ROUTED_WEIGHT: tl.constexpr, + top_k: tl.constexpr, + compute_type: tl.constexpr, + has_zp: tl.constexpr, + use_int4_w4a16: tl.constexpr, + use_int8_w8a16: tl.constexpr): + """ + Implements the fused computation for a Mixture of Experts (MOE) using + token and expert matrices. + + Key Parameters: + - A: The input tensor representing tokens with shape (*, K), where '*' can + be any shape representing batches and K is the feature dimension of + each token. + - B: The stacked MOE weight tensor with shape (E, N, K), where E is + the number of experts, K is the input feature dimension, and N is + the output feature dimension. + - C: The output cache tensor with shape (M, topk, N), where M is the + total number of tokens post padding, topk is the number of times + each token is repeated, and N is the output feature dimension. + - sorted_token_ids: A tensor containing the sorted indices of tokens, + repeated topk times and arranged by the expert index they are + assigned to. + - expert_ids: A tensor containing the indices of the expert for each + block. It determines which expert matrix from B should be used for + each block in A. + This kernel performs the multiplication of a token by its corresponding + expert matrix as determined by `expert_ids`. The sorting of + `sorted_token_ids` by expert index and padding ensures divisibility by + BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix + multiplication across different blocks processed by the same expert. + """ + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + # ---------------------------------------------------------- + # Create pointers for the first blocks of A and B. + # We will advance this pointer as we move in the K direction + # and accumulate + # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers + # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers + num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr) + if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded: + return + offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to( + tl.int64) + offs_token = tl.load(sorted_token_ids_ptr + offs_token_id) + token_mask = offs_token < num_valid_tokens + + off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64) + if off_experts == -1: + # ----------------------------------------------------------- + # Write back zeros to the output when the expert is not + # in the current expert parallel rank. 
+ write_zeros_to_output(c_ptr, stride_cm, stride_cn, pid_n, N, + offs_token, token_mask, BLOCK_SIZE_M, + BLOCK_SIZE_N, compute_type) + return + + offs_bn = (pid_n * BLOCK_SIZE_N + + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + + offs_k[None, :] * stride_ak) + + if use_int4_w4a16: + b_ptrs = b_ptr + off_experts * stride_be + \ + (offs_k[:, None] // 2) * stride_bk + offs_bn[None, :] * \ + stride_bn + b_shifter = (offs_k[:, None] % 2) * 4 + elif use_int8_w8a16: + b_ptrs = b_ptr + off_experts * stride_be + \ + offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn + + if not has_zp and use_int4_w4a16: + b_zp_num = 8 + if not has_zp and use_int8_w8a16: + b_zp_num = 128 + elif has_zp and use_int4_w4a16: + b_zp_shifter = (offs_bn[None, :] % 2) * 4 + + # ----------------------------------------------------------- + # Iterate to compute a block of the C matrix. + # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block + # of fp32 values for higher accuracy. + # `accumulator` will be converted back to fp16 after the loop. + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + # Load the next block of A and B, generate a mask by checking the + # K dimension. + + if not block_k_diviable: + k_mask = offs_k[:, None] < K - k * BLOCK_SIZE_K + k_other = 0.0 + else: + k_mask = None + k_other = None + + a = tl.load(a_ptrs, + mask=token_mask[:, None] & + (offs_k[None, :] < K - k * BLOCK_SIZE_K), + other=0.0) + b = tl.load(b_ptrs) + if use_int4_w4a16: + b = (b >> b_shifter) & 0xF + + b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + \ + offs_bn[None, :] * stride_bsn + \ + ((offs_k[:, None] + BLOCK_SIZE_K * k) // group_size) * \ + stride_bsk + b_scale = tl.load(b_scale_ptrs, mask=k_mask, other=k_other) + b_scale = b_scale.to(tl.float32) + + if has_zp and use_int4_w4a16: + offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size + b_zp_ptrs = b_zp_ptr + off_experts * stride_bze + \ + (offs_bn[None, :] // 2) * stride_bzn + \ + offs_k_true * stride_bzk + b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other) + b_zp = ((b_zp >> b_zp_shifter) & 0xF) + b_zp = b_zp.to(tl.float32) + elif has_zp and use_int8_w8a16: + offs_k_true = (offs_k[:, None] + BLOCK_SIZE_K * k) // group_size + b_zp_ptrs = b_zp_ptr + off_experts * stride_bze + \ + offs_bn[None, :] * stride_bzn + \ + offs_k_true * stride_bzk + b_zp = tl.load(b_zp_ptrs, mask=k_mask, other=k_other) + b_zp = b_zp.to(tl.float32) + + # We accumulate along the K dimension. + if has_zp: + b = ((b.to(tl.float32) - b_zp) * b_scale).to(compute_type) + else: + b = ((b.to(tl.float32) - b_zp_num) * b_scale).to(compute_type) + accumulator = tl.dot(a, b, acc=accumulator) + + # Advance the ptrs to the next K block. 
+ a_ptrs += BLOCK_SIZE_K * stride_ak + if use_int4_w4a16: + b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk + else: + b_ptrs += BLOCK_SIZE_K * stride_bk + + if MUL_ROUTED_WEIGHT: + moe_weight = tl.load(topk_weights_ptr + offs_token, + mask=token_mask, + other=0) + accumulator = accumulator * moe_weight[:, None] + + accumulator = accumulator.to(compute_type) + # ----------------------------------------------------------- + # Write back the block of the output + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[ + None, :] + c_mask = token_mask[:, None] & (offs_cn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + + +@triton.jit +def fused_moe_kernel( + # Pointers to matrices + a_ptr, + b_ptr, + c_ptr, + a_scale_ptr, + b_scale_ptr, + topk_weights_ptr, + sorted_token_ids_ptr, + expert_ids_ptr, + num_tokens_post_padded_ptr, + # Matrix dimensions + N, + K, + EM, + num_valid_tokens, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). + stride_am, + stride_ak, + stride_be, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_asm, + stride_ask, + stride_bse, + stride_bsk, + stride_bsn, + # Block size for block-wise quantization + group_n: tl.constexpr, + group_k: tl.constexpr, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + MUL_ROUTED_WEIGHT: tl.constexpr, + top_k: tl.constexpr, + compute_type: tl.constexpr, + use_fp8_w8a8: tl.constexpr, + use_int8_w8a8: tl.constexpr, + use_int8_w8a16: tl.constexpr, + per_channel_quant: tl.constexpr, +): + """ + Implements the fused computation for a Mixture of Experts (MOE) using + token and expert matrices. + + Key Parameters: + - A: The input tensor representing tokens with shape (*, K), where '*' can + be any shape representing batches and K is the feature dimension of + each token. + - B: The stacked MOE weight tensor with shape (E, N, K), where E is + the number of experts, K is the input feature dimension, and N is + the output feature dimension. + - C: The output cache tensor with shape (M, topk, N), where M is the + total number of tokens post padding, topk is the number of times + each token is repeated, and N is the output feature dimension. + - sorted_token_ids: A tensor containing the sorted indices of tokens, + repeated topk times and arranged by the expert index they are + assigned to. + - expert_ids: A tensor containing the indices of the expert for each + block. It determines which expert matrix from B should be used for + each block in A. + This kernel performs the multiplication of a token by its corresponding + expert matrix as determined by `expert_ids`. The sorting of + `sorted_token_ids` by expert index and padding ensures divisibility by + BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix + multiplication across different blocks processed by the same expert. + """ + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse. 
+ pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + # ---------------------------------------------------------- + # Create pointers for the first blocks of A and B. + # We will advance this pointer as we move in the K direction + # and accumulate + # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers + # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers + num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr) + if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded: + return + offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to( + tl.int64) + offs_token = tl.load(sorted_token_ids_ptr + offs_token_id) + token_mask = offs_token < num_valid_tokens + + off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64) + if off_experts == -1: + # ----------------------------------------------------------- + # Write back zeros to the output when the expert is not + # in the current expert parallel rank. + write_zeros_to_output(c_ptr, stride_cm, stride_cn, pid_n, N, + offs_token, token_mask, BLOCK_SIZE_M, + BLOCK_SIZE_N, compute_type) + return + + offs_bn = (pid_n * BLOCK_SIZE_N + + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + + offs_k[None, :] * stride_ak) + + b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk + + offs_bn[None, :] * stride_bn) + if use_int8_w8a16: + b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[ + None, :] * stride_bsn + b_scale = tl.load(b_scale_ptrs) + + if use_fp8_w8a8 or use_int8_w8a8: + # block-wise + if group_k > 0 and group_n > 0: + a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm + offs_bsn = offs_bn // group_n + b_scale_ptrs = (b_scale_ptr + off_experts * stride_bse + + offs_bsn * stride_bsn) + # channel-wise + elif per_channel_quant: + b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[ + None, :] * stride_bsn + b_scale = tl.load(b_scale_ptrs) + # Load per-token scale for activations + a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm + a_scale = tl.load(a_scale_ptrs, mask=token_mask, other=0.0)[:, + None] + # tensor-wise + else: + a_scale = tl.load(a_scale_ptr) + b_scale = tl.load(b_scale_ptr + off_experts) + + # ----------------------------------------------------------- + # Iterate to compute a block of the C matrix. + # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block + # of fp32 values for higher accuracy. + # `accumulator` will be converted back to fp16 after the loop. + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + # Load the next block of A and B, generate a mask by checking the + # K dimension. + a = tl.load(a_ptrs, + mask=token_mask[:, None] & + (offs_k[None, :] < K - k * BLOCK_SIZE_K), + other=0.0) + b = tl.load(b_ptrs, + mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, + other=0.0) + # We accumulate along the K dimension. 
+ if use_int8_w8a16: + accumulator = tl.dot(a, b.to(compute_type), acc=accumulator) + elif use_fp8_w8a8 or use_int8_w8a8: + if group_k > 0 and group_n > 0: + k_start = k * BLOCK_SIZE_K + offs_ks = k_start // group_k + a_scale = tl.load(a_scale_ptrs + offs_ks * stride_ask, + mask=token_mask, + other=0.0) + b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk) + + accumulator += tl.dot(a, b) * a_scale[:, + None] * b_scale[None, :] + else: + if use_fp8_w8a8: + # acc used to enable fp8_fast_accum + accumulator = tl.dot(a, b, acc=accumulator) + else: + accumulator += tl.dot(a, b) + else: + accumulator += tl.dot(a, b) + # Advance the ptrs to the next K block. + a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + + if MUL_ROUTED_WEIGHT: + moe_weight = tl.load(topk_weights_ptr + offs_token, + mask=token_mask, + other=0) + accumulator = accumulator * moe_weight[:, None] + if use_int8_w8a16: + accumulator = (accumulator * b_scale).to(compute_type) + elif use_fp8_w8a8 or use_int8_w8a8: + if group_k > 0 and group_n > 0: + accumulator = accumulator.to(compute_type) + else: + accumulator = (accumulator * a_scale * b_scale).to(compute_type) + else: + accumulator = accumulator.to(compute_type) + # ----------------------------------------------------------- + # Write back the block of the output + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[ + None, :] + c_mask = token_mask[:, None] & (offs_cn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + + +def invoke_fused_moe_kernel(A: torch.Tensor, + B: torch.Tensor, + C: torch.Tensor, + A_scale: Optional[torch.Tensor], + B_scale: Optional[torch.Tensor], + B_zp: Optional[torch.Tensor], + topk_weights: Optional[torch.Tensor], + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_padded: torch.Tensor, + mul_routed_weight: bool, + top_k: int, + config: dict[str, Any], + compute_type: tl.dtype, + use_fp8_w8a8: bool, + use_int8_w8a8: bool, + use_int8_w8a16: bool, + use_int4_w4a16: bool, + per_channel_quant: bool, + block_shape: Optional[list[int]] = None) -> None: + assert topk_weights is not None or not mul_routed_weight + assert topk_weights is None or topk_weights.stride(1) == 1 + assert sorted_token_ids.stride(0) == 1 + + if use_fp8_w8a8 or use_int8_w8a8: + assert B_scale is not None + assert (block_shape is None + or triton.cdiv(B.size(-2), block_shape[0]) == B_scale.size(-2)) + assert (block_shape is None + or triton.cdiv(B.size(-1), block_shape[1]) == B_scale.size(-1)) + + elif use_int8_w8a16 or use_int4_w4a16: + assert B_scale is not None + assert block_shape is None or block_shape[0] == 0 + else: + assert A_scale is None + assert B_scale is None + + M = A.size(0) + num_tokens = M * top_k + + EM = sorted_token_ids.size(0) + if A.size(0) < config["BLOCK_SIZE_M"]: + # optimize for small batch_size. + # We assume that top_ids of each token is unique, so + # so num_valid_experts <= batch_size <= BLOCK_SIZE_M, + # and we can skip some invalid blocks. 
+ EM = min(sorted_token_ids.size(0), + A.size(0) * top_k * config['BLOCK_SIZE_M']) + grid = lambda META: (triton.cdiv(EM, META['BLOCK_SIZE_M']) * triton.cdiv( + B.size(1), META['BLOCK_SIZE_N']), ) + + if (use_int8_w8a16 or use_int4_w4a16) and \ + block_shape is not None and block_shape[1] > 0: + assert B_scale is not None and B_scale.ndim == 3 + assert B_zp is None or B_zp.ndim == 3 + + use_moe_wna16_cuda = should_moe_wna16_use_cuda( + num_valid_tokens=num_tokens, + group_size=block_shape[1], + num_experts=B.size(0), + bit=4 if use_int4_w4a16 else 8) + config = config.copy() + config.update( + get_moe_wna16_block_config(config=config, + use_moe_wna16_cuda=use_moe_wna16_cuda, + num_valid_tokens=num_tokens, + size_k=A.size(1), + size_n=B.size(1), + num_experts=B.size(1), + group_size=block_shape[1], + real_top_k=top_k, + block_size_m=config["BLOCK_SIZE_M"])) + + if use_moe_wna16_cuda: + bit = 4 if use_int4_w4a16 else 8 + ops.moe_wna16_gemm(A, C, B, B_scale, B_zp, + topk_weights if mul_routed_weight else None, + sorted_token_ids, expert_ids, + num_tokens_post_padded, top_k, + config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"], + config["BLOCK_SIZE_K"], bit) + return + + fused_moe_kernel_gptq_awq[grid]( + A, + B, + C, + B_scale, + B_zp, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + B.size(1), + A.size(1), + EM, + num_tokens, + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(1), + C.stride(2), + B_scale.stride(0), + B_scale.stride(2), + B_scale.stride(1), + B_zp.stride(0) if B_zp is not None else 0, + B_zp.stride(2) if B_zp is not None else 0, + B_zp.stride(1) if B_zp is not None else 0, + block_k_diviable=A.size(1) % config["BLOCK_SIZE_K"] == 0, + group_size=block_shape[1], + MUL_ROUTED_WEIGHT=mul_routed_weight, + top_k=top_k, + compute_type=compute_type, + has_zp=B_zp is not None, + use_int4_w4a16=use_int4_w4a16, + use_int8_w8a16=use_int8_w8a16, + **config, + ) + else: + config = config.copy() + BLOCK_SIZE_K = config.pop("BLOCK_SIZE_K") + if block_shape is not None: + BLOCK_SIZE_K = min(BLOCK_SIZE_K, min(block_shape[0], + block_shape[1])) + fused_moe_kernel[grid]( + A, + B, + C, + A_scale, + B_scale, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + B.size(1), + B.size(2), + EM, + num_tokens, + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(1), + C.stride(2), + A_scale.stride(0) + if A_scale is not None and A_scale.ndim == 2 else 0, + A_scale.stride(1) + if A_scale is not None and A_scale.ndim == 2 else 0, + B_scale.stride(0) + if B_scale is not None and B_scale.ndim >= 2 else 0, + B_scale.stride(2) + if B_scale is not None and B_scale.ndim == 3 else 0, + B_scale.stride(1) + if B_scale is not None and B_scale.ndim >= 2 else 0, + 0 if block_shape is None else block_shape[0], + 0 if block_shape is None else block_shape[1], + MUL_ROUTED_WEIGHT=mul_routed_weight, + top_k=top_k, + compute_type=compute_type, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + per_channel_quant=per_channel_quant, + BLOCK_SIZE_K=BLOCK_SIZE_K, + **config, + ) + + +# Adapted from: https://github.com/sgl-project/sglang/pull/2628 +def get_config_file_name(E: int, + N: int, + dtype: Optional[str], + block_shape: Optional[list[int]] = None) -> str: + device_name = current_platform.get_device_name().replace(" ", "_") + dtype_selector = "" if not dtype else f",dtype={dtype}" + block_shape_selector = ("" if not block_shape or not all(block_shape) else + 
f",block_shape={block_shape}").replace(" ", "") + return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}.json" # noqa: E501 + + +# Adapted from: https://github.com/sgl-project/sglang/pull/2628 +@functools.lru_cache +def get_moe_configs( + E: int, + N: int, + dtype: Optional[str], + block_n: Optional[int] = None, + block_k: Optional[int] = None, +) -> Optional[dict[int, Any]]: + """ + Return optimized configurations for the fused MoE kernel. + + The return value will be a dictionary that maps an irregular grid of + batch sizes to configurations of the fused_moe kernel. To evaluate the + kernel on a given batch size bs, the closest batch size in the grid should + be picked and the associated configuration chosen to invoke the kernel. + """ + + # First look up if an optimized configuration is available in the configs + # directory + block_shape = [block_n, block_k] if block_n and block_k else None + json_file_name = get_config_file_name(E, N, dtype, block_shape) + + config_file_path = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name) + if os.path.exists(config_file_path): + with open(config_file_path) as f: + logger.info("Using configuration from %s for MoE layer.", + config_file_path) + # If a configuration has been found, return it + return {int(key): val for key, val in json.load(f).items()} + + if envs.VLLM_ENABLE_FUSED_MOE_CONFIG_HEURISTICS: + config_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), "configs") + all_config_files = [f for f in os.listdir(config_folder) if os.path.isfile(os.path.join(config_folder, f))] + applicable_name_part = config_file_path.split(",device")[1] + all_applicable_files = [f for f in all_config_files if applicable_name_part in f] + # N given E + available_E = list(set([int(f.split("E=")[1].split(",N=")[0]) for f in all_applicable_files])) + next_best_e = min(available_E, key=lambda x: abs(x - E)) + all_applicable_n = [f for f in all_applicable_files if f"E={next_best_e}" in f] + available_N_given_e = list(set([int(f.split("N=")[1].split(",device")[0]) for f in all_applicable_n])) + next_best_n = min(available_N_given_e, key=lambda x: abs(x - N)) + # E given N + # available_N = list(set([int(f.split("N=")[1].split(",device")[0]) for f in all_applicable_files])) + # next_best_n = min(available_N, key=lambda x: abs(x - N)) + # all_applicable_e = [f for f in all_applicable_files if f"N={next_best_n}" in f] + # available_E_given_n = list(set([int(f.split("E=")[1].split(",N=")[0]) for f in all_applicable_e])) + # next_best_e = min(available_E_given_n, key=lambda x: abs(x - E)) + + fallback_json_file_name = get_config_file_name(next_best_e, next_best_n, dtype, block_shape) + fallback_config_file_path = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "configs", fallback_json_file_name) + if os.path.exists(fallback_config_file_path): + with open(fallback_config_file_path) as f: + logger.warning(("Config file not found at %s. Trying to use next" \ + " best config at %s for MoE layer. Performance" + " might still be sub-optimal."), + config_file_path, fallback_config_file_path) + return {int(key): val for key, val in json.load(f).items()} + + # If no optimized configuration is available (and heuristics is disabled), + # we will use the default configuration + logger.warning( + ("Using default MoE config. Performance might be sub-optimal! 
" + "Config file not found at %s"), config_file_path) + return None + + +def get_moe_wna16_block_config(config: dict[str, + int], use_moe_wna16_cuda: bool, + num_valid_tokens: int, size_k: int, size_n: int, + num_experts: int, group_size: int, + real_top_k: int, block_size_m: int): + if "BLOCK_SIZE_N" in config and "BLOCK_SIZE_K" in config: + # optimal block config is set + return {} + if not use_moe_wna16_cuda: + # triton moe wna16 kernel + if num_valid_tokens // real_top_k == 1: + # if bs=1, use a smaller BLOCK_SIZE_N + return {"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64} + else: + return {"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32} + else: + # cuda moe wna16 kernel + # set default block_size 128, and increase them when num_blocks + # is too large. + block_size_n = 128 + block_size_k = 128 + if block_size_k <= group_size: + block_size_k = group_size + + num_n_blocks = size_k // block_size_k + num_k_blocks = size_n // block_size_k + num_m_blocks = (num_valid_tokens + block_size_m - 1) / block_size_m + \ + num_experts + if num_valid_tokens // real_top_k <= block_size_m: + num_m_blocks = min(num_m_blocks, num_valid_tokens) + num_blocks = num_m_blocks * num_n_blocks * num_k_blocks + + if size_k % 256 == 0 and num_blocks >= 256 and \ + block_size_k < 256: + block_size_k = 256 + num_blocks = num_blocks // (256 // block_size_k) + + if num_m_blocks <= 16 and size_k % (block_size_k * 2) == 0 and \ + size_k % (block_size_k * 2) == 0 and block_size_k <= 512 and \ + num_blocks >= 512: + block_size_k = block_size_k * 2 + num_blocks = num_blocks // 2 + + if num_blocks > 1024: + block_size_n = 256 + num_n_blocks = num_n_blocks // 2 + num_blocks = num_blocks // 2 + + if size_n <= 1024 and num_blocks >= 1024: + # The kernel performance got much better with BLOCK_SIZE_N=1024 + # when num_blocks is large, event when N is small. + # Not sure why, maybe it force the CUDA SM process only one block + # at the same time. + block_size_n = 1024 + + return {"BLOCK_SIZE_N": block_size_n, "BLOCK_SIZE_K": block_size_k} + + +def should_moe_wna16_use_cuda(num_valid_tokens: int, group_size: int, + num_experts: int, bit: int): + return bit == 4 and group_size in [32, 64, 128] and \ + num_valid_tokens / num_experts <= 6 + + +def get_default_config( + M: int, + E: int, + N: int, + K: int, + topk: int, + dtype: Optional[str], + is_marlin: bool, + block_shape: Optional[list[int]] = None, +) -> dict[str, int]: + if dtype == "fp8_w8a8" and block_shape is not None: + # Block-wise quant: BLOCK_SIZE_N must be divisible by block_shape[0] + # BLOCK_SIZE_K must be divisible by block_shape[1] + # num_stages=3 can cause triton.runtime.errors.OutOfResources + # on ROCm, set it to 2 instead. 
+ config = { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": block_shape[0], + "BLOCK_SIZE_K": block_shape[1], + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 if not current_platform.is_rocm() else 2, + } + elif dtype in ["int4_w4a16", "int8_w8a16"] and block_shape is not None: + # moe wna16 kernels + # only set BLOCK_SIZE_M + # BLOCK_SIZE_N and BLOCK_SIZE_K would be set later + bit = 4 if dtype == "int4_w4a16" else 8 + use_moe_wna16_cuda = should_moe_wna16_use_cuda(M * topk, + block_shape[1], E, bit) + if use_moe_wna16_cuda: + config = {"BLOCK_SIZE_M": min(16, M)} + elif M <= 20: + config = {"BLOCK_SIZE_M": 16, "GROUP_SIZE_M": 1} + elif M <= 40: + config = {"BLOCK_SIZE_M": 32, "GROUP_SIZE_M": 1} + else: + config = {"BLOCK_SIZE_M": 64, "GROUP_SIZE_M": 1} + elif is_marlin: + for block_size_m in [8, 16, 32, 48, 64]: + if M * topk / E / block_size_m < 0.9: + break + return {"BLOCK_SIZE_M": block_size_m} + elif M <= E: + config = { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + } + else: + config = { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + } + return config + + +def try_get_optimal_moe_config( + w1_shape: tuple[int, ...], + w2_shape: tuple[int, ...], + top_k: int, + dtype: Optional[str], + M: int, + is_marlin: bool = False, + block_shape: Optional[list[int]] = None, +) -> dict[str, int]: + from vllm.model_executor.layers.fused_moe import get_config + override_config = get_config() + if override_config: + config = override_config + else: + # First try to load optimal config from the file + E, _, N = w2_shape + if dtype == "int4_w4a16": + N = N * 2 + block_n = block_shape[0] if block_shape else 0 + block_k = block_shape[1] if block_shape else 0 + configs = get_moe_configs(E, N, dtype, block_n, block_k) + + if configs: + # If an optimal configuration map has been found, look up the + # optimal config + config = configs[min(configs.keys(), key=lambda x: abs(x - M))] + else: + # Else use the default config + config = get_default_config(M, E, N, w1_shape[2], top_k, dtype, + is_marlin, block_shape) + return config + + +def vllm_topk_softmax(topk_weights: torch.Tensor, topk_indices: torch.Tensor, + token_expert_indices: torch.Tensor, + gating_output: torch.Tensor, + renormalize: bool) -> tuple[torch.Tensor, ...]: + ops.topk_softmax( + topk_weights, + topk_indices, + token_expert_indices, + gating_output, + ) + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + + return topk_weights, topk_indices + + +def dispatch_topk_func() -> Callable[..., tuple[torch.Tensor, ...]]: + return vllm_topk_softmax + + +def fused_topk( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + indices_type: Optional[torch.dtype] = None, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + assert hidden_states.size(0) == gating_output.size(0), ( + "Number of tokens mismatch") + + M, _ = hidden_states.size() + + topk_weights = torch.empty(M, + topk, + dtype=torch.float32, + device=hidden_states.device) + topk_ids = torch.empty( + M, + topk, + dtype=torch.int32 if indices_type is None else indices_type, + device=hidden_states.device) + token_expert_indices = torch.empty(M, + topk, + dtype=torch.int32, + device=hidden_states.device) + + gating_output_float = gating_output.float() # TODO(woosuk): Optimize this. 
+ + topk_func = dispatch_topk_func() + topk_weights, topk_ids = topk_func(topk_weights, topk_ids, + token_expert_indices, + gating_output_float, renormalize) + + return topk_weights, topk_ids, token_expert_indices + + +# This is used by the Deepseek-V2 and Deepseek-V3 model +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) +def grouped_topk( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + num_expert_group: int = 0, + topk_group: int = 0, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None +) -> tuple[torch.Tensor, torch.Tensor]: + + assert hidden_states.size(0) == gating_output.size(0), ( + "Number of tokens mismatch") + + if scoring_func == "softmax": + scores = torch.softmax(gating_output, dim=-1) + elif scoring_func == "sigmoid": + scores = gating_output.sigmoid() + else: + raise ValueError(f"Unsupported scoring function: {scoring_func}") + + num_token = scores.size(0) + if e_score_correction_bias is not None: + # Store original scores before applying correction bias. We use biased + # scores for expert selection but original scores for routing weights + original_scores = scores + scores = scores + e_score_correction_bias.unsqueeze(0) + group_scores = (scores.view(num_token, num_expert_group, + -1).topk(2, dim=-1)[0].sum(dim=-1)) + else: + group_scores = scores.view(num_token, num_expert_group, + -1).max(dim=-1).values # [n, n_group] + group_idx = torch.topk(group_scores, k=topk_group, dim=-1, + sorted=False)[1] # [n, top_k_group] + group_mask = torch.zeros_like(group_scores) # [n, n_group] + group_mask.scatter_(1, group_idx, 1) # [n, n_group] + score_mask = group_mask.unsqueeze(-1).expand( + num_token, num_expert_group, + scores.size(-1) // num_expert_group).reshape(num_token, -1) # [n, e] + tmp_scores = scores.masked_fill(~score_mask.bool(), + float("-inf")) # [n, e] + + if e_score_correction_bias is not None: + topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)[1] + # Use original unbiased scores for the routing weights + topk_weights = original_scores.gather(1, topk_ids) + else: + topk_weights, topk_ids = torch.topk(tmp_scores, + k=topk, + dim=-1, + sorted=False) + + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + + return topk_weights.to(torch.float32), topk_ids.to(torch.int32) + + +def get_config_dtype_str( + dtype: torch.dtype, + use_int4_w4a16: Optional[bool] = False, + use_int8_w8a16: Optional[bool] = False, + use_fp8_w8a8: Optional[bool] = False, + use_mxfp4_w4a4: Optional[bool] = False) -> Optional[str]: + if use_fp8_w8a8: + return "fp8_w8a8" + elif use_int8_w8a16: + return "int8_w8a16" + elif use_int4_w4a16: + return "int4_w4a16" + elif use_mxfp4_w4a4: + return "mxfp4_w4a4" + elif dtype == torch.float: + # avoiding cases where kernel fails when float32 MoE + # use fp16/bfloat16 configs + return "float32" + return None + + +# def inplace_fused_experts(hidden_states: torch.Tensor, +# w1: torch.Tensor, +# w2: torch.Tensor, +# topk_weights: torch.Tensor, +# topk_ids: torch.Tensor, +# activation: str = "silu", +# apply_router_weight_on_input: bool = False, +# use_fp8_w8a8: bool = False, +# use_int8_w8a8: bool = False, +# use_int8_w8a16: bool = False, +# use_int4_w4a16: bool = False, +# use_mxfp4_w4a4: bool = False, +# per_channel_quant: bool = False, +# global_num_experts: int = -1, +# expert_map: Optional[torch.Tensor] = None, +# w1_scale: Optional[torch.Tensor] = None, +# w2_scale: Optional[torch.Tensor] = 
None, +# w1_zp: Optional[torch.Tensor] = None, +# w2_zp: Optional[torch.Tensor] = None, +# a1_scale: Optional[torch.Tensor] = None, +# a2_scale: Optional[torch.Tensor] = None, +# block_shape: Optional[list[int]] = None) -> None: +# fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True, +# activation, apply_router_weight_on_input, use_fp8_w8a8, +# use_int8_w8a8, use_int8_w8a16, use_int4_w4a16, +# use_mxfp4_w4a4, per_channel_quant, global_num_experts, +# expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale, +# a2_scale, block_shape) +# +# +# def inplace_fused_experts_fake( +# hidden_states: torch.Tensor, +# w1: torch.Tensor, +# w2: torch.Tensor, +# topk_weights: torch.Tensor, +# topk_ids: torch.Tensor, +# activation: str = "silu", +# apply_router_weight_on_input: bool = False, +# use_fp8_w8a8: bool = False, +# use_int8_w8a8: bool = False, +# use_int8_w8a16: bool = False, +# use_int4_w4a16: bool = False, +# use_mxfp4_w4a4: bool = False, +# per_channel_quant: bool = False, +# global_num_experts: int = -1, +# expert_map: Optional[torch.Tensor] = None, +# w1_scale: Optional[torch.Tensor] = None, +# w2_scale: Optional[torch.Tensor] = None, +# w1_zp: Optional[torch.Tensor] = None, +# w2_zp: Optional[torch.Tensor] = None, +# a1_scale: Optional[torch.Tensor] = None, +# a2_scale: Optional[torch.Tensor] = None, +# block_shape: Optional[list[int]] = None) -> None: +# pass +# +# +# direct_register_custom_op( +# op_name="inplace_fused_experts", +# op_func=inplace_fused_experts, +# mutates_args=["hidden_states"], +# fake_impl=inplace_fused_experts_fake, +# tags=(torch.Tag.needs_fixed_stride_order, ), +# ) +# +# +# def outplace_fused_experts( +# hidden_states: torch.Tensor, +# w1: torch.Tensor, +# w2: torch.Tensor, +# topk_weights: torch.Tensor, +# topk_ids: torch.Tensor, +# activation: str = "silu", +# apply_router_weight_on_input: bool = False, +# use_fp8_w8a8: bool = False, +# use_int8_w8a8: bool = False, +# use_int8_w8a16: bool = False, +# use_int4_w4a16: bool = False, +# use_mxfp4_w4a4: bool = False, +# per_channel_quant: bool = False, +# global_num_experts: int = -1, +# expert_map: Optional[torch.Tensor] = None, +# w1_scale: Optional[torch.Tensor] = None, +# w2_scale: Optional[torch.Tensor] = None, +# w1_zp: Optional[torch.Tensor] = None, +# w2_zp: Optional[torch.Tensor] = None, +# a1_scale: Optional[torch.Tensor] = None, +# a2_scale: Optional[torch.Tensor] = None, +# block_shape: Optional[list[int]] = None) -> torch.Tensor: +# return fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, +# False, activation, apply_router_weight_on_input, +# use_fp8_w8a8, use_int8_w8a8, use_int8_w8a16, +# use_int4_w4a16, use_mxfp4_w4a4, +# per_channel_quant, global_num_experts, +# expert_map, w1_scale, w2_scale, w1_zp, w2_zp, +# a1_scale, a2_scale, block_shape) +# +# +# def outplace_fused_experts_fake( +# hidden_states: torch.Tensor, +# w1: torch.Tensor, +# w2: torch.Tensor, +# topk_weights: torch.Tensor, +# topk_ids: torch.Tensor, +# activation: str = "silu", +# use_fp8_w8a8: bool = False, +# use_int8_w8a8: bool = False, +# use_int8_w8a16: bool = False, +# use_int4_w4a16: bool = False, +# use_mxfp4_w4a4: bool = False, +# per_channel_quant: bool = False, +# global_num_experts: int = -1, +# expert_map: Optional[torch.Tensor] = None, +# w1_scale: Optional[torch.Tensor] = None, +# w2_scale: Optional[torch.Tensor] = None, +# w1_zp: Optional[torch.Tensor] = None, +# w2_zp: Optional[torch.Tensor] = None, +# a1_scale: Optional[torch.Tensor] = None, +# a2_scale: Optional[torch.Tensor] = None, 
+# block_shape: Optional[list[int]] = None) -> torch.Tensor: +# return torch.empty_like(hidden_states) +# +# +# direct_register_custom_op( +# op_name="outplace_fused_experts", +# op_func=outplace_fused_experts, +# mutates_args=[], +# fake_impl=outplace_fused_experts_fake, +# tags=(torch.Tag.needs_fixed_stride_order, ), +# ) +# +# +# def torch_vllm_inplace_fused_experts(**kwargs) -> torch.Tensor: +# torch.ops.vllm.inplace_fused_experts(**kwargs) +# hidden_states = kwargs['hidden_states'] +# return hidden_states +# +# +# def torch_vllm_outplace_fused_experts(**kwargs) -> torch.Tensor: +# return torch.ops.vllm.outplace_fused_experts(**kwargs) +# +# +# def dispatch_fused_experts_func(inplace: bool) -> Callable[..., torch.Tensor]: +# if inplace: +# return torch_vllm_inplace_fused_experts +# return torch_vllm_outplace_fused_experts + + +# # TODO (bnell): replace this with modular op. Can get rid of inplace/outplace +# # torch ops. +# def fused_experts( +# hidden_states: torch.Tensor, +# w1: torch.Tensor, +# w2: torch.Tensor, +# topk_weights: torch.Tensor, +# topk_ids: torch.Tensor, +# inplace: bool = False, +# activation: str = "silu", +# apply_router_weight_on_input: bool = False, +# use_fp8_w8a8: bool = False, +# use_int8_w8a8: bool = False, +# use_int8_w8a16: bool = False, +# use_int4_w4a16: bool = False, +# use_mxfp4_w4a4: bool = False, +# per_channel_quant: bool = False, +# global_num_experts: int = -1, +# expert_map: Optional[torch.Tensor] = None, +# w1_scale: Optional[torch.Tensor] = None, +# w2_scale: Optional[torch.Tensor] = None, +# w1_zp: Optional[torch.Tensor] = None, +# w2_zp: Optional[torch.Tensor] = None, +# a1_scale: Optional[torch.Tensor] = None, +# a2_scale: Optional[torch.Tensor] = None, +# block_shape: Optional[list[int]] = None, +# allow_deep_gemm: bool = False, +# allow_cutlass_block_scaled_grouped_gemm: bool = False) -> torch.Tensor: +# # For now, disable DeepGemm for small N (<= 512) until better +# # permute/unpermute ops are available. +# # However, on B200, we use DeepGemm for all cases because they only support +# # E8M0 scale, which means we requantize the weight and input to the specific +# # scale. Fallen back to cutlass or triton for some cases would cause +# # accuracy issue. 
+# N = w1.size(1) +# should_use_deep_gemm = ((N > 512 +# and _valid_deep_gemm(hidden_states, w1, w2)) +# or is_blackwell_deep_gemm_used()) +# if (allow_deep_gemm and use_fp8_w8a8 and should_use_deep_gemm): +# assert apply_router_weight_on_input is False +# return deep_gemm_moe_fp8( +# hidden_states=hidden_states, +# w1=w1, +# w2=w2, +# topk_weights=topk_weights, +# topk_ids=topk_ids, +# inplace=inplace, +# activation=activation, +# global_num_experts=global_num_experts, +# expert_map=expert_map, +# w1_scale=w1_scale, +# w2_scale=w2_scale, +# a1_scale=a1_scale, +# a2_scale=a2_scale, +# apply_router_weight_on_input=apply_router_weight_on_input, +# ) +# elif (allow_cutlass_block_scaled_grouped_gemm and use_fp8_w8a8 +# and _valid_cutlass_block_scaled_grouped_gemm( +# w1, w2, inplace, activation, apply_router_weight_on_input, +# expert_map)): +# return run_cutlass_block_scaled_fused_experts( +# a=hidden_states, +# w1=w1, +# w2=w2, +# w1_scale=w1_scale, +# w2_scale=w2_scale, +# topk_weights=topk_weights, +# topk_ids=topk_ids) +# else: +# return dispatch_fused_experts_func(inplace)( +# hidden_states=hidden_states, +# w1=w1, +# w2=w2, +# topk_weights=topk_weights, +# topk_ids=topk_ids, +# activation=activation, +# apply_router_weight_on_input=apply_router_weight_on_input, +# use_fp8_w8a8=use_fp8_w8a8, +# use_int8_w8a8=use_int8_w8a8, +# use_int8_w8a16=use_int8_w8a16, +# use_int4_w4a16=use_int4_w4a16, +# use_mxfp4_w4a4=use_mxfp4_w4a4, +# per_channel_quant=per_channel_quant, +# global_num_experts=global_num_experts, +# expert_map=expert_map, +# w1_scale=w1_scale, +# w2_scale=w2_scale, +# w1_zp=w1_zp, +# w2_zp=w2_zp, +# a1_scale=a1_scale, +# a2_scale=a2_scale, +# block_shape=block_shape) + + +def fused_experts_impl( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + inplace: bool = False, + activation: str = "silu", + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_mxfp4_w4a4: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[list[int]] = None, +) -> torch.Tensor: + # Check constraints. 
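+    # Note: in the 4-bit paths below (int4_w4a16 and mxfp4), w1 packs two
+    # values per element along its last dimension, so the activation hidden
+    # size is halved before being compared against w1.size(2).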
+ if use_int4_w4a16: + assert hidden_states.size(1) // 2 == w1.size(2), ( + "Hidden size mismatch") + elif use_mxfp4_w4a4: + # 16bit activation and fp4x2 packed weight + assert hidden_states.size(1) // 2 == w1.size(2), "hidden size mismatch" + else: + assert hidden_states.size(1) == w1.size(2), ( + f"Hidden size mismatch {hidden_states.size(1)} != {w1.size(2)}") + + assert topk_weights.size() == topk_ids.size(), "topk shape mismatch" + assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" + assert w1.stride(-1) == 1, "Stride of last dimension must be 1" + assert w2.stride(-1) == 1, "Stride of last dimension must be 1" + assert hidden_states.dtype in [ + torch.float32, torch.float16, torch.bfloat16 + ] + + num_tokens = hidden_states.size(0) + E, N, _ = w1.size() + K = w2.size(1) + if global_num_experts == -1: + global_num_experts = E + top_k_num = topk_ids.size(1) + # We execute the fused_moe kernel in chunks to circumvent this issue: + # https://github.com/vllm-project/vllm/issues/5938 + CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE + M = min(num_tokens, CHUNK_SIZE) + config_dtype = get_config_dtype_str(use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + use_mxfp4_w4a4=use_mxfp4_w4a4, + dtype=hidden_states.dtype) + + qtype = get_config_quant_dtype(use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + use_mxfp4_w4a4=use_mxfp4_w4a4) + + get_config_func = functools.partial( + try_get_optimal_moe_config, + w1.size(), + w2.size(), + top_k_num, + config_dtype, + block_shape=block_shape, + ) + + config = get_config_func(M) + + # We can reuse the memory between these because by the time we need + # cache3, we're done with cache1 + cache13 = torch.empty(M * top_k_num * max(N, K), + device=hidden_states.device, + dtype=hidden_states.dtype) + intermediate_cache1 = cache13[:M * top_k_num * N].view(M, top_k_num, N) + intermediate_cache3 = cache13[:M * top_k_num * K].view(M, top_k_num, K) + + # This needs separate memory since it's used concurrently with cache1 + intermediate_cache2 = torch.empty((M * top_k_num, N // 2), + device=hidden_states.device, + dtype=hidden_states.dtype) + + if hidden_states.dtype == torch.bfloat16: + compute_type = tl.bfloat16 + elif hidden_states.dtype == torch.float16: + compute_type = tl.float16 + elif hidden_states.dtype == torch.float32: + compute_type = tl.float32 + else: + raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}") + + if inplace: + out_hidden_states = hidden_states + else: + out_hidden_states = torch.empty_like(hidden_states) + + if use_mxfp4_w4a4: + # Weight has to be dequantized for mxfp4 emulation. + w1 = dequant_mxfp4(w1, w1_scale, hidden_states.dtype) + w1_scale = None + w2 = dequant_mxfp4(w2, w2_scale, hidden_states.dtype) + w2_scale = None + + for chunk in range((num_tokens // CHUNK_SIZE) + 1): + begin_chunk_idx, end_chunk_idx = (chunk * CHUNK_SIZE, + min((chunk + 1) * CHUNK_SIZE, + num_tokens)) + curr_hidden_states = hidden_states[begin_chunk_idx:end_chunk_idx] + tokens_in_chunk, _ = curr_hidden_states.size() + + if tokens_in_chunk == 0: + break + + if tokens_in_chunk < CHUNK_SIZE and chunk > 0: + # Adjust the intermediate cache size and config for the last + # chunk. Note that in most cases we only have one chunk + # so the cache size and config are already set correctly and + # do not need to be adjusted. 
+ intermediate_cache1 = intermediate_cache1[:tokens_in_chunk] + intermediate_cache2 = intermediate_cache2[:tokens_in_chunk * + topk_ids.size(1)] + intermediate_cache3 = intermediate_cache3[:tokens_in_chunk] + config = get_config_func(tokens_in_chunk) + + curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx] + curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx] + qcurr_hidden_states, a1q_scale = moe_kernel_quantize_input( + A=curr_hidden_states, + A_scale=a1_scale, + quant_dtype=qtype, + per_act_token_quant=per_channel_quant, + block_shape=block_shape) + + sorted_token_ids, expert_ids, num_tokens_post_padded = ( + moe_align_block_size(curr_topk_ids, config['BLOCK_SIZE_M'], + global_num_experts, expert_map)) + + invoke_fused_moe_kernel(qcurr_hidden_states, + w1, + intermediate_cache1, + a1q_scale, + w1_scale, + w1_zp, + curr_topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + apply_router_weight_on_input, + top_k_num, + config, + compute_type=compute_type, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + per_channel_quant=per_channel_quant, + block_shape=block_shape) + + if activation == "silu": + torch.ops._C.silu_and_mul(intermediate_cache2, + intermediate_cache1.view(-1, N)) + elif activation == "gelu": + torch.ops._C.gelu_and_mul(intermediate_cache2, + intermediate_cache1.view(-1, N)) + else: + raise ValueError(f"Unsupported FusedMoe activation: {activation}") + + qintermediate_cache2, a2q_scale = moe_kernel_quantize_input( + A=intermediate_cache2, + A_scale=a2_scale, + quant_dtype=qtype, + per_act_token_quant=per_channel_quant, + block_shape=block_shape) + + invoke_fused_moe_kernel(qintermediate_cache2, + w2, + intermediate_cache3, + a2q_scale, + w2_scale, + w2_zp, + curr_topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + not apply_router_weight_on_input, + 1, + config, + compute_type=compute_type, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + per_channel_quant=per_channel_quant, + block_shape=block_shape) + + ops.moe_sum(intermediate_cache3.view(*intermediate_cache3.size()), + out_hidden_states[begin_chunk_idx:end_chunk_idx]) + + return out_hidden_states + + +def fused_moe( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + inplace: bool = False, + activation: str = "silu", + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, + topk_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + use_mxfp4_w4a4: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[list[int]] = None, +) -> torch.Tensor: + """ + This function computes a Mixture of Experts (MoE) layer using two sets of + weights, w1 and w2, and top-k gating mechanism. + + Parameters: + - hidden_states (torch.Tensor): The input tensor to the MoE layer. + - w1 (torch.Tensor): The first set of expert weights. 
+ - w2 (torch.Tensor): The second set of expert weights. + - gating_output (torch.Tensor): The output of the gating operation + (before softmax). + - topk (int): The number of top-k experts to select. + - renormalize (bool): If True, renormalize the top-k weights to sum to 1. + - inplace (bool): If True, perform the operation in-place. + Defaults to False. + - activation (str): The activation function to apply after the first + MoE layer. + - num_expert_group: Optional[int]: additional parameter for grouped_topk + - topk_group: Optional[int]: additional parameter for grouped_topk + - use_grouped_topk: If True, use grouped_topk instead of fused_topk + note: Deepseekv2 model uses grouped_topk + - use_fp8_w8a8 (bool): If True, use fp8 arithmetic to compute the inner + products for w1 and w2. Defaults to False. + - use_int8_w8a8 (bool): If True, use int8 arithmetic to compute the inner + products for w1 and w2. Defaults to False. + - use_int8_w8a16 (bool): If True, use matmul of int8 weight and bf16/fp16 + activation to compute the inner products for w1 and w2. + Defaults to False. + - use_int4_w4a16 (bool): If True, use matmul of int4 weight and bf16/fp16 + activation to compute the inner products for w1 and w2. + Defaults to False. + - use_mxfp4_w4a4 (bool): If True, use matmul of OCP MXFP4 weight and + OCP MXFP4 activation to compute the inner products for w1 and w2. + Defaults to False. + - global_num_experts (int): The total number of experts in the global + expert space. + - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices + from the global expert space to the local expert space of the expert + parallel shard. + - w1_scale (Optional[torch.Tensor]): Optional scale to be used for + w1. + - w2_scale (Optional[torch.Tensor]): Optional scale to be used for + w2. + - a1_scale (Optional[torch.Tensor]): Optional scale to be used for + a1. + - a2_scale (Optional[torch.Tensor]): Optional scale to be used for + a2. + - block_shape: (Optional[list[int]]): Optional block size for block-wise + quantization. + + Returns: + - torch.Tensor: The output tensor after applying the MoE layer. 
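+
+    Example (illustrative only; shapes follow the parameter descriptions
+    above, with E experts, hidden size K and per-expert intermediate size N):
+        hidden_states: (num_tokens, K)
+        w1: (E, 2 * N, K)   # gate and up projections stacked along dim 1
+        w2: (E, K, N)
+        gating_output: (num_tokens, E)
+        out = fused_moe(hidden_states, w1, w2, gating_output, topk=2,
+                        renormalize=True)  # out has shape (num_tokens, K)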
+ """ + + if use_grouped_topk: + assert num_expert_group is not None and topk_group is not None + topk_weights, topk_ids = grouped_topk(hidden_states, gating_output, + topk, renormalize, + num_expert_group, topk_group) + elif custom_routing_function is None: + topk_weights, topk_ids, token_expert_indices = fused_topk( + hidden_states, gating_output, topk, renormalize) + else: + topk_weights, topk_ids = custom_routing_function( + hidden_states, gating_output, topk, renormalize) + + return fused_experts_impl(hidden_states, + w1, + w2, + topk_weights, + topk_ids, + inplace=inplace, + activation=activation, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + use_mxfp4_w4a4=use_mxfp4_w4a4, + per_channel_quant=per_channel_quant, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=w1_zp, + w2_zp=w2_zp, + a1_scale=a1_scale, + a2_scale=a2_scale, + block_shape=block_shape) + + +# class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): +# +# def __init__( +# self, +# use_fp8_w8a8: bool = False, +# use_int8_w8a8: bool = False, +# use_int8_w8a16: bool = False, +# use_int4_w4a16: bool = False, +# use_mxfp4_w4a4: bool = False, +# per_act_token_quant: bool = False, +# block_shape: Optional[list[int]] = None, +# ): +# super().__init__( +# FusedMoEQuantConfig.make( +# use_fp8_w8a8=use_fp8_w8a8, +# use_int8_w8a8=use_int8_w8a8, +# use_int8_w8a16=use_int8_w8a16, +# use_int4_w4a16=use_int4_w4a16, +# use_mxfp4_w4a4=use_mxfp4_w4a4, +# per_act_token_quant=per_act_token_quant, +# block_shape=block_shape, +# )) +# +# self.use_fp8_w8a8 = use_fp8_w8a8 +# self.use_int4_w4a16 = use_int4_w4a16 +# self.use_int8_w8a8 = use_int8_w8a8 +# self.use_int8_w8a16 = use_int8_w8a16 +# self.use_mxfp4_w4a4 = use_mxfp4_w4a4 +# +# @property +# def activation_formats( +# self +# ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: +# return (mk.FusedMoEActivationFormat.Standard, +# mk.FusedMoEActivationFormat.Standard) +# +# def supports_chunking(self) -> bool: +# return True +# +# def supports_expert_map(self) -> bool: +# return True +# +# def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: +# return TopKWeightAndReduceNoOP() +# +# def workspace_shapes( +# self, +# a: torch.Tensor, +# aq: torch.Tensor, +# M: int, +# N: int, +# K: int, +# topk: int, +# global_num_experts: int, +# local_num_experts: int, +# expert_tokens_meta: Optional[mk.ExpertTokensMetadata], +# ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]: +# workspace1 = (M, topk, max(N // 2, K)) +# workspace2 = (M, topk, max(N, K)) +# output = (M, K) +# return (workspace1, workspace2, output, a.dtype) +# +# def apply( +# self, +# output: torch.Tensor, +# hidden_states: torch.Tensor, +# w1: torch.Tensor, +# w2: torch.Tensor, +# topk_weights: torch.Tensor, +# topk_ids: torch.Tensor, +# activation: str, +# global_num_experts: int, +# expert_map: Optional[torch.Tensor], +# w1_scale: Optional[torch.Tensor], +# w2_scale: Optional[torch.Tensor], +# w1_zp: Optional[torch.Tensor], +# w2_zp: Optional[torch.Tensor], +# a1q_scale: Optional[torch.Tensor], +# a2_scale: Optional[torch.Tensor], +# workspace13: torch.Tensor, +# workspace2: torch.Tensor, +# expert_tokens_meta: Optional[mk.ExpertTokensMetadata], +# apply_router_weight_on_input: bool, +# ): +# # Check constraints. 
+# if self.use_int4_w4a16: +# assert hidden_states.size(-1) // 2 == w1.size(2), ( +# "Hidden size mismatch") +# else: +# assert hidden_states.size(-1) == w1.size(2), \ +# (f"Hidden size mismatch {hidden_states.size(-1)} " +# f"!= {w1.size(2)}") +# +# assert hidden_states.is_contiguous( +# ), "Hidden_states must be contiguous" +# assert hidden_states.dim() == 2 +# assert w1.stride(-1) == 1, "Stride of last dimension must be 1" +# assert w2.stride(-1) == 1, "Stride of last dimension must be 1" +# assert hidden_states.dtype in [ +# torch.float32, torch.float16, torch.bfloat16, torch.float8_e4m3fn +# ] +# +# E, num_tokens, N, K, top_k_num = mk._moe_problem_size( +# hidden_states, w1, w2, topk_ids) +# +# if global_num_experts == -1: +# global_num_experts = E +# +# config_dtype = get_config_dtype_str(use_fp8_w8a8=self.use_fp8_w8a8, +# use_int8_w8a16=self.use_int8_w8a16, +# use_int4_w4a16=self.use_int4_w4a16, +# use_mxfp4_w4a4=self.use_mxfp4_w4a4, +# dtype=hidden_states.dtype) +# +# config = try_get_optimal_moe_config( +# w1.size(), +# w2.size(), +# top_k_num, +# config_dtype, +# num_tokens, +# block_shape=self.block_shape, +# ) +# +# if hidden_states.dtype == torch.bfloat16: +# compute_type = tl.bfloat16 +# elif hidden_states.dtype == torch.float16: +# compute_type = tl.float16 +# elif hidden_states.dtype == torch.float32: +# compute_type = tl.float32 +# elif hidden_states.dtype == torch.float8_e4m3fn: +# compute_type = tl.bfloat16 +# else: +# raise ValueError( +# f"Unsupported compute_type: {hidden_states.dtype}") +# +# # Note that the output tensor might be in workspace1 +# intermediate_cache1 = _resize_cache(workspace2, +# (num_tokens, top_k_num, N)) +# intermediate_cache2 = _resize_cache(workspace13, +# (num_tokens * top_k_num, N // 2)) +# intermediate_cache3 = _resize_cache(workspace2, +# (num_tokens, top_k_num, K)) +# +# sorted_token_ids, expert_ids, num_tokens_post_padded = ( +# moe_align_block_size(topk_ids, config['BLOCK_SIZE_M'], +# global_num_experts, expert_map)) +# +# invoke_fused_moe_kernel( +# hidden_states, +# w1, +# intermediate_cache1, +# a1q_scale, +# w1_scale, +# w1_zp, +# None, # topk_weights +# sorted_token_ids, +# expert_ids, +# num_tokens_post_padded, +# False, # mul_routed_weights +# top_k_num, +# config, +# compute_type=compute_type, +# use_fp8_w8a8=self.use_fp8_w8a8, +# use_int8_w8a8=self.use_int8_w8a8, +# use_int8_w8a16=self.use_int8_w8a16, +# use_int4_w4a16=self.use_int4_w4a16, +# per_channel_quant=self.per_act_token_quant, +# block_shape=self.block_shape) +# +# self.activation(activation, intermediate_cache2, +# intermediate_cache1.view(-1, N)) +# +# a2q_scale: Optional[torch.Tensor] = None +# +# qintermediate_cache2, a2q_scale = moe_kernel_quantize_input( +# intermediate_cache2, a2_scale, self.quant_dtype, +# self.per_act_token_quant, self.block_shape) +# +# invoke_fused_moe_kernel(qintermediate_cache2, +# w2, +# intermediate_cache3, +# a2q_scale, +# w2_scale, +# w2_zp, +# topk_weights, +# sorted_token_ids, +# expert_ids, +# num_tokens_post_padded, +# not apply_router_weight_on_input, +# 1, +# config, +# compute_type=compute_type, +# use_fp8_w8a8=self.use_fp8_w8a8, +# use_int8_w8a8=self.use_int8_w8a8, +# use_int8_w8a16=self.use_int8_w8a16, +# use_int4_w4a16=self.use_int4_w4a16, +# per_channel_quant=self.per_act_token_quant, +# block_shape=self.block_shape) +# +# ops.moe_sum(intermediate_cache3, output) +# + +# def modular_triton_fused_moe( +# use_fp8_w8a8: bool, +# use_int8_w8a8: bool, +# use_int8_w8a16: bool, +# use_int4_w4a16: bool, +# use_mxfp4_w4a4: bool, +# 
per_act_token_quant: bool, +# block_shape: Optional[list[int]] = None, +# ) -> mk.FusedMoEModularKernel: +# return mk.FusedMoEModularKernel( +# MoEPrepareAndFinalizeNoEP(), +# TritonExperts( +# use_fp8_w8a8=use_fp8_w8a8, +# use_int8_w8a8=use_int8_w8a8, +# use_int8_w8a16=use_int8_w8a16, +# use_int4_w4a16=use_int4_w4a16, +# use_mxfp4_w4a4=use_mxfp4_w4a4, +# per_act_token_quant=per_act_token_quant, +# block_shape=block_shape, +# ), +# ) diff --git a/scripts/benchmark.py b/scripts/benchmark.py index c1799b268..8293e6b65 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -118,6 +118,13 @@ class BatchComposition(Enum): STATE_N_GROUPS = [1] HAS_INITIAL_STATE = [True] +MOE_NUM_EXPERTS = [8] +MOE_N = [14336] # intermediate size of mixtral-8x7b +MOE_K = [4096] # for mixtral-8x7b +TP_FACTOR = [1, 2] +MOE_TOP_K = [2] # for mixtral-8x7b, mixtral-8x22b + + IMPLEMENTATION_UT = [ Implementation.TRITON_2D, Implementation.TRITON_3D, @@ -167,6 +174,12 @@ class BatchComposition(Enum): "MAX_VALUES", "STATE_DIM", "STATE_N_GROUPS", + "HAS_INITIAL_STATE", + "MOE_NUM_EXPERTS", + "MOE_N", + "MOE_K", + "TP_FACTOR", + "MOE_TOP_K", ] # "BENCHMARK_MODES", "IMPLEMENTATION_UT" ] debug_env_vars = [ @@ -187,8 +200,12 @@ class BatchComposition(Enum): import json envfile_path = os.path.abspath(envfile_name) - print(f"\nApplied test config: {envfile_path}") + if not os.path.isfile(envfile_path): + raise RuntimeError(f"Test config file {envfile_path} does not exist.") env_setting = dotenv_values(envfile_path) + if len(env_setting) == 0: + raise RuntimeError(f"Test config file {envfile_path} does not contain valid configs.") + print(f"\nApplied test config: {envfile_path}") # filter allowed, convert all to lists env_setting_filtered = { k: json.loads(env_setting[k]) for k in test_setup_vars if k in env_setting @@ -1705,6 +1722,218 @@ def generate_dummy_data(batch_size): raise e +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("seqlen", SEQUENCE_LENGTHS) +@pytest.mark.parametrize("n", MOE_N) +@pytest.mark.parametrize("k", MOE_K) +@pytest.mark.parametrize("e", MOE_NUM_EXPERTS) +@pytest.mark.parametrize("tp", TP_FACTOR) +@pytest.mark.parametrize("topk", MOE_TOP_K) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("max_value", MAX_VALUES) +# @pytest.mark.parametrize("implementation", IMPLEMENTATION_UT) +@pytest.mark.parametrize("benchmark_mode", BENCHMARK_MODES) +def test_fused_moe( + capsys, + request, + batch_size, + seqlen, + n: int, + k: int, + e: int, + tp: int, + topk: int, + dtype: torch.dtype, + seed, + max_value, + # implementation, + benchmark_mode, +): + # based on: https://github.com/vllm-project/vllm/blob/main/tests/kernels/test_moe.py + from vllm.model_executor.layers.activation import SiluAndMul + + my_id = request.node.nodeid.split("::")[-1] + my_name = my_id.split("[")[0] + my_instance = my_id.split("[")[1][:-1] + + def torch_moe(a, w1, w2, score, topk): + B, D = a.shape + a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device) + score = torch.softmax(score, dim=-1, dtype=torch.float32) + topk_weight, topk_ids = torch.topk(score, topk) + topk_weight = topk_weight.view(-1) + topk_ids = topk_ids.view(-1) + for i in range(w1.shape[0]): + mask = topk_ids == i + if mask.sum(): + out[mask] = SiluAndMul()( + a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1) + return (out.view(B, -1, w2.shape[1]) * + topk_weight.view(B, -1, 
1).to(out.dtype)).sum(dim=1) + + from ibm_triton_lib.kernels import fused_moe + # my_experts = TritonExperts() + # fused_moe = my_experts.apply + + torch.manual_seed(seed) + tdev = torch.device(device) + torch.cuda.set_device(tdev) + # m = batch_size * seqlen + num_tokens = batch_size * seqlen + m = num_tokens + n = int(n//tp) + + # ATOL = 1e-2 + # TODO + ATOL = max(1e-2, 2 * max_value) + RTOL = 0 + + a = None + w1 = None + w2 = None + score = None + torch_output = None + triton_output = None + + inner_exception = None + try: + + a = torch.randn((m, k), device=tdev, dtype=dtype).normal_(mean=0.0, std=0.5 * max_value) + # w1 = torch.randn((e, n, k), device=tdev, dtype=dtype).normal_(mean=0.0, std=0.5 * max_value) + # w2 = torch.randn((e, k, n//2), device=tdev, dtype=dtype).normal_(mean=0.0, std=0.5 * max_value) + w1 = torch.randn((e, 2 * n, k), device=tdev, dtype=dtype).normal_(mean=0.0, std=0.5 * max_value) + w2 = torch.randn((e, k, n), device=tdev, dtype=dtype).normal_(mean=0.0, std=0.5 * max_value) + score = torch.randn((m, e), device=tdev, dtype=dtype) + + input_gating = torch.empty(num_tokens, e, dtype=torch.float32, device=tdev) + + if enforce_numerical_correctness: + torch_output = torch_moe(a, w1, w2, score, topk) + assert torch_output is not None + """ + from fused_moe.py + Key Parameters: + - A: The input tensor representing tokens with shape (*, K), where '*' can + be any shape representing batches and K is the feature dimension of + each token. + - B: The stacked MOE weight tensor with shape (E, N, K), where E is + the number of experts, K is the input feature dimension, and N is + the output feature dimension. + - C: The output cache tensor with shape (M, topk, N), where M is the + total number of tokens post padding, topk is the number of times + each token is repeated, and N is the output feature dimension. + """ + + # TODO: renormalize? + triton_output = fused_moe(a, w1, w2, input_gating, topk, + renormalize=True) #inplace=True ? + assert triton_output is not None + + captured = '' + if capsys is not None: + captured_raw = capsys.readouterr() # returns stdout, stderr + for l in captured_raw: + if len(l) > 0: + # captured += l # + '|' + captured += l + ' ' + + # compare + allclose_pass = float('nan') + if enforce_numerical_correctness: + triton.testing.assert_close(torch_output, triton_output, atol=ATOL, rtol=RTOL) + allclose_pass = True + + call_func_under_test = lambda: fused_moe(a, w1, w2, input_gating, topk, + renormalize=True, inplace=True) + + # benchmark only correct results + if do_benchmarks: + if my_name not in pytest.global_pds: + pytest.global_pds[my_name] = pd.DataFrame() + + # equals to defaults + warmup_rep = 25 + bench_rep = 100 + ms, min_ms, max_ms = measure_benchmarks( + benchmark_mode, call_func_under_test, warmup_rep, bench_rep + ) + + record = { + "batch_size": batch_size, + "seqlen": seqlen, + "num_tokens": num_tokens, # redundant? 
+ "N": n, + "K": k, + "E": e, + "TP": tp, + "topk": topk, + "max_value": max_value, + "dtype": dtype, + # "implementation": implementation, + "ms": ms, + "min_ms": min_ms, + "max_ms": max_ms, + "benchmark_mode": benchmark_mode, + "allclose_pass": allclose_pass, + "ATOL": ATOL, + "RTOL": RTOL, + # "proton_count": proton_count, + # "proton_ns": proton_ns, + # "proton_util_compute": proton_util_compute, + # "proton_util_bw": proton_util_bw, + "captured": captured, + } + + if add_triton_dejavu_envs: + dejavu_envs = {} + _skip_dejavu_envs = [ + "_TRITON_DEJAVU_DETERMINED_CUDA_VERSION", + "DEBUG", + "STORAGE", + ] + for env in os.environ.keys(): + if "TRITON_DEJAVU_" in env: + if any([skip_s in env for skip_s in _skip_dejavu_envs]): + continue + dejavu_envs[env] = os.environ[env] + record.update(dejavu_envs) + + pytest.global_pds[my_name] = pd.concat( + [pytest.global_pds[my_name], pd.Series(record).to_frame().T] + ).reset_index(drop=True) + + if pytest.global_pd_file_prefix is not None: + filename = os.path.abspath( + f"{pytest.global_pd_file_prefix}/{my_name}.csv" + ) + write_df_and_chmod(pytest.global_pds[my_name], filename) + + except Exception as e: + print(e) + inner_exception = e + finally: + # cleanup memory + try: + del a + del w1 + del w2 + del score + del triton_output + del torch_output + torch.cuda.empty_cache() + torch.cuda.ipc_collect() + except Exception as e: + print(e) + # pass + finally: + if inner_exception is not None: + raise inner_exception + + + + def measure_benchmarks( benchmark_mode, call_func_under_test, warmup_rep=25, bench_rep=100 ): diff --git a/scripts/setups/granite4_moe_0.conf b/scripts/setups/granite4_moe_0.conf new file mode 100644 index 000000000..1103ff8d6 --- /dev/null +++ b/scripts/setups/granite4_moe_0.conf @@ -0,0 +1,20 @@ +BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64, 128] +# BATCH_SIZES = [4] +SEQUENCE_LENGTHS = [16, 32, 64, 128, 512, 1024, 2048, 4096] + +MOE_N = [768] # intermediate size +MOE_K = [4096] # hidden size +MOE_TOP_K = [10] # num_experts_per_tok +TP_FACTOR = [1, 2] +# DTYPES = ["bfloat16"] +DTYPES = ["float16"] + +BENCHMARK_MODES = ["CUDA_EVENTS"] +# BENCHMARK_MODES = ["CUDA_GRAPS"] + +IMPLEMENTATION_UT = ["BASELINE_TRITON"] # some value for now + +# TRITON_BACKEND_DEBUG = 1 +# STORE_TEST_RESULT_PATH=/results + +TEST_ALLOW_INCORRECT = 1 From dbddbb19d59614a6535abf0ca8dae230143edca2 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Fri, 18 Jul 2025 21:33:45 +0000 Subject: [PATCH 09/61] first tuning result Signed-off-by: Burkhard Ringlein --- ...384,device_name=NVIDIA_H100_80GB_HBM3.json | 146 + tune_log_g4small.txt | 14544 ++++++++++++++++ 2 files changed, 14690 insertions(+) create mode 100644 E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 tune_log_g4small.txt diff --git a/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json b/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 000000000..a7cfd175d --- /dev/null +++ b/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/tune_log_g4small.txt b/tune_log_g4small.txt new file mode 100644 index 000000000..e87bf0ece --- /dev/null +++ b/tune_log_g4small.txt @@ -0,0 +1,14544 @@ +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.21952056884766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.08415967226028, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 95.1449602842331, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.96367955207825, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.01135909557343, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} 
+{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.8116797208786, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.98880136013031, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.25296032428741, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 120.80431878566742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.81823897361755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.77791965007782, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.6235209107399, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.30752062797545, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.14223992824554, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.07647919654846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.75008189678192, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 120.48751831054688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.90320003032684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.32399988174438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.56895941495895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.95551943778992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.86191987991333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.41519868373871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.11391997337341, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 120.62847971916199, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.37600016593933, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.97728019952774, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.66848009824753, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.06767928600313, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.1966392993927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.42016017436981, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.24863958358765, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.62255942821503, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.56479978561401, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.26896071434021, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.28272032737732, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.04415988922119, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.0792008638382, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.50959920883179, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.26288032531738, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.2688010931015, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.45119988918304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.7017593383789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 104.7870409488678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.5108813047409, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.76943969726562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.97296071052551, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.18736004829407, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.30688011646271, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.32127964496613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.01535964012146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.80000078678131, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.95936036109924, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.87296104431152, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.53248119354248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.3331196308136, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.66159987449646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.10959911346436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.59728074073792, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.66848075389862, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.40544068813324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.71199989318848, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.05631959438324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.78655993938446, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.29023921489716, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.43103921413422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 132.36160099506378, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 135.17599940299988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.5961594581604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, 
"num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.2511990070343, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.91151797771454, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.92255985736847, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.28623974323273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.46527969837189, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 132.73632049560547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 135.31391978263855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.36176180839539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.91471946239471, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.6344004869461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.98496055603027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.47920024394989, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.38431930541992, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 132.95120060443878, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 135.2299201488495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.37504053115845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.74495947360992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.05023980140686, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 140.11984050273895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.35696125030518, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.0168000459671, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 132.5961595773697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 135.19232034683228, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.04480040073395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.37583923339844, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 136.5473598241806, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.8715192079544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.84543907642365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.47103989124298, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.00479996204376, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.70496046543121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.76848137378693, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.4023984670639, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.50992047786713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.8615991473198, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.36511981487274, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.80255913734436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.90768045186996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.87664031982422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.71503937244415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.07711946964264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.30224001407623, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.21104001998901, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.1295998096466, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.3929613828659, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.92687910795212, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.64911925792694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.03391945362091, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.43920040130615, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.37056005001068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.80703997612, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, 
"num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.48832023143768, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.26639986038208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.1515206694603, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.96400046348572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.19855999946594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.8088002204895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.56783890724182, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.04335874319077, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.51872050762177, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.60496008396149, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.72272074222565, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.21567976474762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.23936033248901, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.17279994487762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.64064049720764, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.37472021579742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.7777590751648, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.35952007770538, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.8687995672226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.75968039035797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.43968081474304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.30911993980408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.02687954902649, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.53423988819122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.84975969791412, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
104.03359889984131, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.55631983280182, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.79584074020386, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.93856072425842, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.30256044864655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.96799790859222, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.97087967395782, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.3231999874115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.31471943855286, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.96576023101807, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.367520570755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.65295994281769, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.47903847694397, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.14735960960388, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.33376026153564, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.4876799583435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.05407989025116, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.07632029056549, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.60560071468353, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.70032131671906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.22111988067627, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.88960099220276, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.00464046001434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.16095888614655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.5769602060318, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.2671993970871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.15855932235718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.54255974292755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.0723204612732, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.99807965755463, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.6131204366684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.03247916698456, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.36495912075043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.80544030666351, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.19104027748108, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 131.7660790681839, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.06976091861725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.76255965232849, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.42143988609314, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.3687995672226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.9324803352356, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.37872052192688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.53839981555939, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 132.99808025360107, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.91008031368256, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.22000086307526, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.40383970737457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.48848032951355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.6155207157135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.22688007354736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.99743902683258, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 133.76015901565552, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.34768152236938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.54607999324799, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.20000076293945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.48479974269867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.52512013912201, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.23184096813202, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.87504029273987, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 133.6572802066803, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.28063881397247, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.78575921058655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.30751979351044, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.29695904254913, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.46479988098145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.60704064369202, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.41152131557465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.95151948928833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.23456013202667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.39776062965393, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.10367977619171, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.68639945983887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.25311958789825, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.8051209449768, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.3476802110672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.16704058647156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.41008043289185, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.75903964042664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.16175937652588, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.17488014698029, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.29679989814758, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.19839978218079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.41855978965759, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.36767852306366, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.24863910675049, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.33199977874756, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.05583798885345, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.09647822380066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.77983915805817, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
192.7124798297882, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.70815801620483, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.71151959896088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.72240233421326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.77952075004578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.56304037570953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.02832114696503, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 120.07695853710175, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.40479922294617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.9411200284958, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.3185601234436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.49311912059784, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.10111904144287, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 120.27872025966644, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.73104083538055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.63423943519592, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.74831986427307, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.85120010375977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.6529585123062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.87776100635529, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.250559091568, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.06048035621643, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.62160098552704, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.8033584356308, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.68688011169434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 120.04672050476074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.29199922084808, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.9167994260788, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.55887842178345, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.2655987739563, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.06879901885986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.35344219207764, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.46080029010773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.7460800409317, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.9990395307541, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.17199909687042, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.28848087787628, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.12816047668457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.47775924205781, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.46000039577484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.22399914264679, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.9752002954483, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.80512034893034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.72831928730011, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.7443196773529, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.141921043396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.5223995447159, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.50720012187958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.36864054203033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.70255970954895, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 128.24624001979828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 105.74352025985718, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.95888149738312, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.94751930236816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.72047889232635, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.25104022026062, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.62576007843018, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.32144069671631, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.92720019817352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.7012814283371, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.63279938697815, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.06560027599335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.5304002761841, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.85968053340912, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.41184139251709, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.9236798286438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 128.36591958999634, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.86223900318146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.77183973789215, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.62944054603577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.11712086200714, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.81632018089294, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.0520007610321, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.4864000082016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.69871914386749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.40607988834381, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.2665604352951, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, 
"num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.5385603904724, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.19055914878845, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.74703991413116, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.71455979347229, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.6947191953659, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.65887999534607, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.05568087100983, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.01648020744324, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.14928007125856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.02896082401276, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.32528102397919, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.6811203956604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.23071992397308, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.77631902694702, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.06111943721771, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.93888032436371, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.60496175289154, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.0680000782013, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.84319949150085, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.4415991306305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.13472127914429, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.74336063861847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.73904037475586, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.28559994697571, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.27183973789217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 124.1975998878479, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.65455877780914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 149.7278380393982, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.15903985500336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.024799823761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.91360116004944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.57583999633789, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.71583950519562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 149.4928002357483, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.28992092609406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.74128079414368, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.632958650589, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.46528112888336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.60336017608643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 150.0075203180313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.48351860046387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.64655888080597, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.55056059360504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.67999970912933, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.87264001369476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 150.05552113056183, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.58527982234955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.86480045318604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.49600088596344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.97728097438812, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.11856126785278, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.73776030540466, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.58143985271454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 133.38784039020538, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.42559957504272, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.33104085922241, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.37104201316833, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.50160014629364, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.28943920135498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.50591969490051, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.0841612815857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 134.6785604953766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.56159949302673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.84159994125366, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.31999981403351, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.83199942111969, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.42111897468567, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.46144092082977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.34768056869507, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 134.23024117946625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.45343911647797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.84159934520721, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.46431994438171, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.75375938415529, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.53247892856598, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.52127921581268, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 103.80144000053406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 134.35039937496185, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.07647931575775, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.87664031982422, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.11376023292542, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.85967814922333, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.84575998783112, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.45504021644592, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.59488093852997, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.74128103256226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.36368036270142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.89103972911835, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.68336057662964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.91631984710693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.44480013847351, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.45135998725891, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.40288043022156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.58799934387207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.28895998001099, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.3340802192688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.92304134368896, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.30864036083221, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.1041601896286, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.21472001075745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.94800066947937, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.44304025173187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.37984156608582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.39584136009216, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.21984088420868, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.46896076202393, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.42127895355225, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.31056010723114, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.45711827278137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.57376039028168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.37167930603027, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.409921169281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.98415994644165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 133.2455998659134, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.7712004184723, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 128.44863891601562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.43071913719177, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 133.6139190196991, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.42127883434296, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.65392065048218, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.86176061630249, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 133.6020803451538, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.33040022850037, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.65999972820283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.97120010852814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 132.95343935489655, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.95071983337402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 128.15903961658478, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.68064057826996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.98351907730103, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.36175954341888, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.51632022857666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.44208121299744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 144.69775915145874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.07408046722412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.52512013912201, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.69024002552032, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.5959997177124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.2868800163269, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.76304030418396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.39680075645447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 147.35983788967133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.19343960285187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.71935939788818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.7367992401123, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.60032165050507, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.35871911048889, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.34912085533142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.36272060871124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.61008059978485, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.5107192993164, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.61135995388031, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.15823984146118, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.62480127811432, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.68415987491608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.47103977203369, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.61184000968933, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 147.23424017429352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.47151911258698, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.7670407295227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.01759910583496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.87151968479156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 117.00736105442047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.4523184299469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.98016059398651, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.85216009616852, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.68816030025482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.799840092659, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.98751997947693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.63568019866943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.80623948574066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.47712087631226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.29728031158447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.5996813774109, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.76976084709167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.77055990695953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.9096006155014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.09008026123047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.53312063217163, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.26447820663452, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.49983859062195, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.35967993736267, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.06191968917847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.03199982643127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.27120053768158, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.6116794347763, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.5921607017517, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.65600085258484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.43024051189423, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.62656140327454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.24607956409453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} 
+{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.0454398393631, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.67343997955322, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.01455950737, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.62752044200897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.32335925102234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 128.9548796415329, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.6206395626068, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.89376056194305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.85776019096375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.4478394985199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.97279894351959, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 128.38175892829895, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.45024061203003, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.90512001514435, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.17328011989594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.41440045833588, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.26928174495697, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 128.43840062618256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 225.7868790626526, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.65120136737823, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 225.12335777282715, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.25151884555817, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 226.0313606262207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.28895950317383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 226.62800073623657, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 174.0836799144745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 129.52304005622864, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.67568051815033, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.33920097351074, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.82191979885101, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.00879991054535, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.1534389257431, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.25584101676941, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.10847973823547, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 129.828320145607, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.13487887382507, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.21135914325714, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.77072060108185, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.1289598941803, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.72975957393646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.0329601764679, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.58287966251372, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 129.964799284935, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.12879979610443, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.29088127613068, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.74480032920837, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.1448016166687, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.63503885269165, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.68143939971924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.30496048927306, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 129.66271877288818, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} 
+{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.46016025543213, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.95600152015686, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.1267204284668, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.16656005382538, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.56544029712677, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.83792006969452, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.81999886035919, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.80527949333191, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 129.81472074985504, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.24527931213377, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.70640063285828, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.6512006521225, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.0147204399109, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.72767961025238, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.4935985803604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.47487878799437, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.1350394487381, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.74000024795532, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.30271935462952, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.80384063720703, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.26336252689362, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.5336000919342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.71200048923492, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.774240732193, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.1995198726654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.43903994560242, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 129.79679882526398, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.79695963859558, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.90367949008942, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.72384083271027, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.81775867938995, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.1662393808365, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.96143901348114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.18992066383362, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.4731193780899, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.2235198020935, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.8363196849823, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.0247997045517, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 174.85967993736267, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.47679746150969, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.01503896713257, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.11951959133148, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.44816052913666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 135.84495902061462, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.42847967147827, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.0433599948883, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.43791949748993, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.99712014198303, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.29888021945953, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.53103935718536, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.45040047168732, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 135.70480108261108, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.9902400970459, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.6401596069336, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.67295944690704, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.02431833744049, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.87152111530304, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.74975979328156, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.56224083900452, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.25664114952087, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.51568126678467, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.98176038265228, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.36960124969482, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.4332801103592, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.68464028835297, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.0854400396347, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.6854418516159, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.23440027236938, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.72208142280579, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.92464065551758, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.55232048034668, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.52096033096313, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.72879981994629, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.4087997674942, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.4590392112732, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.65296113491058, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.88736009597778, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.3200011253357, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.17872095108032, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.52735924720764, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.9382404088974, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.43007957935333, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.02559900283813, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.87903976440428, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.18287992477417, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.04304099082947, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.41071951389313, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.43984043598175, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.95152008533478, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 109.26992058753967, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 140.4124802350998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.09360003471375, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.29375958442688, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.38719940185547, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.85760045051575, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.08112049102783, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 140.84991931915283, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.27647960186005, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.60351896286011, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.8652799129486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.21391940116882, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.69903922080994, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 140.53727984428406, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.84816062450409, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.4835205078125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.46832013130188, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.39151978492737, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.90143871307373, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 140.96384048461914, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.27280080318451, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.50783979892731, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.46175932884216, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 142.08927989006042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.96495938301086, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 142.3358392715454, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.09039855003357, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 142.34272003173828, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.99744033813477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 142.0116800069809, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.1139212846756, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.59056103229523, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.37391996383667, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.40623903274536, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.6054402589798, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.82912003993988, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.2590389251709, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.4771190881729, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.97791957855225, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.39360105991364, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 143.63711893558502, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.01119816303253, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.52016067504883, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.27679932117462, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 143.75552117824554, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 147.8719997406006, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.73103952407837, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.143679022789, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 143.49343955516815, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.17631912231445, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.16864049434662, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 158.76463949680328, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 144.22991931438446, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.81103885173798, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.08687925338745, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.0835200548172, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 143.50224137306213, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.12000048160553, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.63855934143066, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.64975929260254, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 143.96927952766418, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.81663870811462, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.7153605222702, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.07488214969635, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 143.67088079452515, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.09632074832916, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.2899204492569, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.77568006515503, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 144.21407878398895, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.98543965816498, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.23311924934387, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.79552125930786, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.00015926361084, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.90256083011627, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.06847989559174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.55519950389862, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.89967954158783, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.53759944438934, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.66671872138977, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.73455953598022, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 117.17920005321503, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 147.48495995998383, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.64719927310944, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.82831990718842, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.00640082359314, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.74896001815796, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 117.17712044715881, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 147.51216053962708, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.34527909755707, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.10383975505829, 
"config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.12752103805542, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.36672019958496, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.50336027145386, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.86367869377136, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.23535978794098, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.32480013370514, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.21215963363647, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.35007977485657, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.5939199924469, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 147.432159781456, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.19151997566223, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.4444808959961, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 157.4072003364563, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 142.2188800573349, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.96032106876373, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.51071846485138, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.96464002132416, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.30400002002716, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.24848008155823, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.32464051246643, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.78847980499268, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.49935948848724, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.16751968860626, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.9262398481369, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.87023985385895, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} 
+{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.42592060565948, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.36464059352875, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.4867205619812, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.83535861968994, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.40655982494354, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.41471982002258, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.31855976581573, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.9025604724884, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.37103962898254, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.44303977489471, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.0249594449997, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.8339204788208, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 215.62079906463623, "config": {"BLOCK_SIZE_M": 128, 
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 957.859525680542, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 215.8238399028778, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 957.7103996276855, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 216.74816131591797, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.9918441772461, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 216.19743824005127, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 310.342880487442, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 220.659362077713, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 311.50832176208496, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 224.53823924064636, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 308.7073600292206, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 220.53872108459473, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 311.4625608921051, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 224.4108808040619, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 309.74640011787415, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 219.55552220344543, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 310.98495960235596, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 223.21711897850037, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 308.4230399131775, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 219.64751839637756, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 311.1513590812683, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 224.38512086868286, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 313.44271898269653, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 201.63088023662567, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 312.7019190788269, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 201.55679941177368, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 312.4072003364563, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 200.57920217514038, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 312.54207849502563, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 200.35167932510376, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1742.3779296875, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.2067174911499, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1744.2910289764404, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.79375982284546, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1742.711524963379, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.9486424922943, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1744.2323207855225, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.87616205215454, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.01952052116394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.62031960487366, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.48032069206238, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.61920046806335, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 292.09808349609375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 280.1147210597992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 284.5358383655548, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.73935866355896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.8942415714264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.5022382736206, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.53183913230896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.71408128738403, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.57855796813965, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 281.1942434310913, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 276.89647793769836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.0896019935608, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.91872119903564, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.92143750190735, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.91775965690613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.46143865585327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 293.72976183891296, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 293.0740785598755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 281.610723733902, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.45647621154785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.82255816459656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 242.9921579360962, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.35407876968384, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.0457639694214, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.85599875450134, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 285.91423869132996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.71984219551086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 266.5283179283142, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.64704275131226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.27807807922363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.508159160614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.9947168827057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.11983919143677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.1719994544983, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.00688219070432, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.21440148353577, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.6347198486328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.54880023002625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.0759983062744, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.36015939712524, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.635196685791, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.87407779693604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.4654381275177, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.40352058410645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.22224044799805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.44975972175598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.95392274856567, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} 
+{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.35312247276306, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 266.2020790576935, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.5236814022064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.7295968532562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.6262412071228, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.78448009490967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.9145622253418, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.6259183883667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.08495998382568, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.9737591743469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.7124786376953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.457279920578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 274.2793595790863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.83663868904114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.13760018348694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 349.6822392940521, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.5526382923126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 297.20735907554626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 286.7516803741455, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.6828806400299, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.03728008270264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.69167852401733, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 266.7803204059601, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 340.13248085975647, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.70608043670654, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
306.3313591480255, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 283.1761598587036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.5363199710846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.9785556793213, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 274.56560254096985, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 274.64272141456604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.59135913848877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.4353623390198, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 299.56159949302673, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.5857594013214, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.70063948631287, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.99152159690857, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.5889608860016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.45600056648254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 342.4415969848633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.3417594432831, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.79967880249023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 295.44464111328125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.6841607093811, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.87120366096497, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.46335887908936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.45728015899658, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.69248151779175, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.59776306152344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.20623898506165, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.37439966201782, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} 
+{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.82351922988892, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.10031986236572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.88816046714783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.73376178741455, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.47247910499573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.9795217514038, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.8196816444397, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 232.325279712677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.05903935432434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.71856093406677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.4265582561493, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.63824009895325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.48495948314667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.89199948310852, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.21136140823364, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.273921251297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.92447924613953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.62288284301758, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.49440050125122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.66384196281433, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.24335932731628, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.32975792884827, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.1110372543335, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.0479965209961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.40528202056885, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.74479913711548, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.82207894325256, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.98671865463257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.4832007884979, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.5790386199951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.31536173820496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.48175740242004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.7710394859314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.99440026283264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.50207924842834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.41551899909973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.1222424507141, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.58799982070923, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 248.72640132904053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.18607878684998, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.2105596065521, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.2393605709076, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.97424244880676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.96575951576233, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.4475221633911, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.58624362945557, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.51439809799194, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.0265612602234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.7398383617401, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 258.18512082099915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 232.16031908988953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.16160035133362, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.67487788200376, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.6921582221985, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.73792147636414, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.2716784477234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.1812801361084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.38911938667297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.5995206832886, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 307.0958375930786, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.876318693161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.8156816959381, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.9065592288971, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 319.15056228637695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, 
"num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.56784033775332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.78559970855713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.91088247299194, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 303.17375779151917, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.9083209037781, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.3523201942444, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 269.7886383533478, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 313.21776032447815, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.73791790008542, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 281.6812777519226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.67519807815552, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.09056115150452, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.99008011817932, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.23855805397034, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.23551988601685, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.90400099754333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.79983925819397, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.69999885559082, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.42991876602173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.66223859786987, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.32511854171753, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.9667203426361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.7841601371765, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.87407875061035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.08240222930908, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 240.35088181495667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.58880019187927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.51728177070618, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.64719653129578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.23487997055054, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.01328110694885, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.54559922218323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.3521602153778, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.5411217212677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.48640036582947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.79248213768005, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.17456150054932, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.45440101623535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.31743693351746, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.96320271492004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.69440007209778, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.26880073547363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.08368062973022, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.31856060028076, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.72479915618896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.98784041404724, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.70687818527222, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.71871900558472, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.0591995716095, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 258.6526393890381, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.16304254531863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.5201587677002, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.5073606967926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.42799925804138, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.10608005523682, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.4976007938385, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.79184079170227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.6611201763153, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.9363214969635, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.62848114967346, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.9859209060669, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.1092803478241, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 258.82399916648865, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
242.89615988731384, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.1480007171631, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.896320104599, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.4648003578186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 285.84223985671997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.4737572669983, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 284.8292803764343, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.12367725372314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 285.5462396144867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.26096200942993, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.5062370300293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.85535717010498, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.52207827568054, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 258.00464034080505, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.47824025154114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.8764772415161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.53631949424744, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.367680311203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.0223984718323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.33823943138123, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.268159866333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.0889618396759, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.8067181110382, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.81071734428406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.4396777153015, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.3075180053711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 
4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.84320092201233, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.187358379364, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.13311767578122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.80271863937375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.39583897590637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.08655858039856, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.8209617137909, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.90256071090698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.5795180797577, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.66607880592346, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.0385603904724, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.0435209274292, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.8940799236298, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.3729588985443, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.29280114173892, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.6579215526581, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.80336093902588, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.41727948188782, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.78176021575928, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.95904159545898, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.37631821632385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 294.4534409046173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.32879877090454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.8732810020447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.06496000289917, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 243.02096128463745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.1769597530365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.95407819747925, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.50239872932434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.561119556427, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.6796782016754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.4238407611847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.3622419834137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.67791867256165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.03183722496033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.99663829803467, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.4945592880249, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.8449604511261, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 287.86896109580994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 276.31184339523315, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.7171187400818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.4345588684082, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.65344047546387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.42144060134885, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.26943850517276, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 294.95248079299927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 280.48160314559937, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.8073606491089, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.88479804992676, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.863840341568, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.07840180397034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} 
+{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.32480001449585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 318.296320438385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 281.1646378040314, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 266.11536145210266, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.92464232444763, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 333.23952078819275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.41232204437256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.2470405101776, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.9382390975952, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 319.0112018585205, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 284.5129609107971, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.53664088249207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.92224311828613, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 338.9891195297241, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.2369611263275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.85183858871463, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 269.7323191165924, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 325.809121131897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 284.0215992927551, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 275.96768021583557, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.17887806892395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 333.4542381763458, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.36319851875305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.27279806137088, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.1724796295166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 318.25583934783936, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.9756796360016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.6875183582306, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 274.04703974723816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 324.6737587451935, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 319.8260819911957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.1152012348175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 419.54336047172546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 307.84255862236023, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.5800006389618, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.23616003990173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 308.19984197616577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 400.4302382469177, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 415.4870367050171, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 315.3063988685608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.09984135627747, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.7343990802765, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 315.063841342926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.21135926246643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.1636850833893, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 321.2718403339386, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.9774408340454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.6552016735077, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 306.02863907814026, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 392.4412775039673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.19839882850647, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 300.17647981643677, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.0225579738617, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.6119978427887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.1150405406952, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.2388801574707, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.29615950584412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.89968276023865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.54175877571106, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.54176235198975, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.50863981246948, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.8007991313934, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.05983996391296, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.47103881835938, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.85103750228882, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.82559943199158, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.84928178787231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.43408226966858, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.93296027183533, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.54960083961487, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.7364799976349, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.97456169128418, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.04367876052856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.67360281944275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.10319852828977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.76943969726562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.44080185890198, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.47872114181519, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.02239799499512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.66575860977173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.80672144889832, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.90288066864014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.3140833377838, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.33375883102417, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.84720134735107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.96367835998535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.1340811252594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.88079833984375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.28271985054016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 276.7140805721283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.33247923851013, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.88559937477112, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.92959690093994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.3996813297272, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.0416009426117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.35360145568848, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.64287972450256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 283.1444811820984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.221120595932, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.9395182132721, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.56591749191284, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 276.5841603279114, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.94032192230225, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.5580816268921, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.8782386779785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 276.3051176071167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.3062379360199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.2075209617615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 274.66384172439575, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 279.14639949798584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.3352026939392, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.99887919425964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.5262405872345, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.946560382843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.3987205028534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.03615975379944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 269.5625603199005, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 269.2307209968567, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 299.5020806789398, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.6351993083954, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 334.1220808029175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 311.4311981201172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 306.94656133651733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.89888215065, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 332.91375756263733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 318.68255972862244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 305.2179217338562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.6174385547638, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 324.68144059181213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 307.50240087509155, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 302.46543765068054, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.1768000125885, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 326.7999994754791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 313.32687854766846, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.9155192375183, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.112318277359, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.65424132347107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.95135807991028, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.82335948944092, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.8308789730072, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 244.21503901481628, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.98160338401794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.93824172019958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.4609603881836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.35536074638367, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.9515199661255, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.82656073570254, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.333758354187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.3955225944519, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.16192078590393, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.4726424217224, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.96480178833008, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.79151916503906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.4827220439911, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.57679867744446, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.49823999404907, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.5929594039917, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.00640082359314, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.96751880645752, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.54479598999023, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.84208130836487, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.13327860832214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.64624071121216, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.0177628993988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.38079929351807, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.2073621749878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.36735606193545, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.5886380672455, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.56352066993713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.75376057624817, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.1084794998169, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 266.60223841667175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.6502411365509, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.4468812942505, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.03775882720947, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.4503996372223, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.9881603717804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.61855959892273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.05903887748718, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.181759595871, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.9526391029358, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 258.75744104385376, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.7089557647705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 307.1668839454651, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.5600032806396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 306.48687958717346, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.3556799888611, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 309.7987174987793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.0430335998535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 304.7865641117096, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.86544203758237, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 264.17824029922485, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.06832218170166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 258.1599998474121, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.82112193107605, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.24799919128418, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.28224182128906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.64639830589294, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 266.28495693206787, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.66912055015564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.73392367362976, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.87312006950378, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.76671957969666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.53999948501587, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.1921606063843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.95040011405945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.00752067565918, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.16256165504458, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.0209617614746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.87872266769406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.6051208972931, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.7390389442444, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.25311923027039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.53503966331482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.9148817062378, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.4718391895294, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.39551854133606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.84287881851196, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.16207909584045, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.4056005477905, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.1934413909912, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.5265598297119, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.2599997520447, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.70944094657898, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.88944029808047, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 294.3438386917114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 303.8156771659851, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 258.8860809803009, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.76559948921204, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 331.9489586353302, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.9116792678833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.90032243728638, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.0103991031647, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 305.0382399559021, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 305.0651204586029, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 266.00223898887634, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.1060814857483, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 328.5153615474701, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.91376066207886, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.2166352272034, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.3091206550598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 304.85520124435425, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 308.750718832016, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.677122592926, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 281.07760071754456, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 333.1982409954071, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.9015965461731, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.69200110435486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.76496148109436, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 301.6969621181488, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 305.0539195537567, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.70336055755615, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 278.0448019504547, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 328.02639842033386, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 301.6195213794708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 330.54239869117737, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 326.04080080986023, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.1075246334076, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.969598531723, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.0534417629242, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 312.14287996292114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 333.5860800743103, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 326.30671858787537, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.1630401611328, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.79167914390564, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.1092805862427, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 308.8918387889862, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 339.65872049331665, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 
4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 322.49824047088623, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.6551992893219, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.41216015815735, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.408322095871, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 310.00768065452576, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 341.06191754341125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 326.85360074043274, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.3158423900604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.6566393375397, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.7044794559479, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.16608119010925, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.8992009162903, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.1785578727722, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.07280254364014, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.02735900878906, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.01648330688477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.1176047325134, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.0009641647339, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.0222396850586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.36479878425598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.6985614299774, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.03632164001465, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 275.51679849624634, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.28448057174685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 276.2667179107666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 272.8003215789795, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.69983983039856, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.87807965278625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.2582411766052, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.89519906044006, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.10096311569214, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.03392028808594, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.64735555648804, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.8367967605591, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.8977611064911, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.3278419971466, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.69487833976746, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.6022391319275, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 279.27183985710144, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.77632308006287, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.29984068870544, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.9742386341095, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.2351987361908, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.29807662963867, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.14720249176025, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.1451184749603, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 283.33136081695557, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.3596796989441, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.55439949035645, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 275.05647897720337, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 317.56208062171936, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} 
+{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 285.6779193878174, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 349.53311800956726, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 283.9254403114319, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 328.9796793460846, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.03280091285706, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.37903928756714, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 285.36831855773926, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 315.91487884521484, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 295.1248002052307, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.2944006919861, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.6284821033478, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 320.6968021392822, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.4179184436798, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.3081605434418, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.4956798553467, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.79231977462769, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.16111850738525, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.3575987815857, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.0916805267334, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.85792088508606, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.7716805934906, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.6688003540039, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.910560131073, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.87888169288635, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.35568070411685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
254.7852802276611, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.56767964363098, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.1105630397797, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.27679896354678, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.36527681350708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.58559846878052, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.32384085655212, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.81376194953918, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.10688066482544, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.9110412597656, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.42976093292236, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.25695967674255, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.9756810665131, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.64143919944763, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 297.15904116630554, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.2817602157593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 295.2950417995453, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.0811195373535, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 295.78351974487305, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 291.86432123184204, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.31808042526245, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 294.034880399704, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.79551696777344, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.22127890586853, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.48528027534485, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.64159870147705, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, 
"num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.0388813018799, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.42656111717224, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.20639967918396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.4167950153351, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 461.5758419036865, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.62799644470215, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.1078460216522, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.31056451797485, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.94576358795166, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 411.25648260116577, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.98624205589294, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 396.9478392601013, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.14416551589966, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 
32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 428.6241555213928, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 381.84320092201233, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.6214380264282, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.0388813018799, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 428.0851221084595, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.11967873573303, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.59983706474304, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.97088146209717, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.95503854751587, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.623681306839, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.19296288490295, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.2398376464844, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
423.88240218162537, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 383.4703993797302, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.3279995918274, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.9094362258911, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 431.24783754348755, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.7156789302826, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.02800011634827, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.70239877700806, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.4281632900238, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.055522441864, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.9203209877014, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.6140785217285, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.5257587432861, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.0452761650085, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.5503988265991, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.0516810417175, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.5406398773193, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.0111994743347, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6547193527222, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 343.8593578338623, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 306.2324810028076, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 286.6699206829071, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 335.2665615081787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.46751976013184, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 278.62751960754395, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 344.0990400314331, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} 
+{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 319.4875192642212, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.9278407096863, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 340.2192008495331, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.33520126342773, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.98943758010864, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 344.2724812030792, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 314.9502408504486, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 294.36911940574646, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 336.9374406337738, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 315.5336010456085, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 286.7252838611603, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.9003186225891, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 312.473760843277, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.367520570755, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 334.7265613079071, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 299.4095981121063, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 274.95967984199524, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 414.65823888778687, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.8539206981659, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.7694363594055, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.9126410484314, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 414.7548806667328, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.22975873947144, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 427.466082572937, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.46880197525024, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
380.79456090927124, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 298.4494411945343, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.0571210384369, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.117280960083, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.3257601261139, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 300.56959986686707, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.30672001838684, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.8596785068512, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 384.7779190540314, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 300.81615924835205, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.26879811286926, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.8436770439148, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 383.302081823349, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 297.9479992389679, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.8054401874542, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.52944111824036, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1227.089433670044, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 292.55151867866516, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1205.5846405029297, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 291.9147193431854, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.11039352417, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.8095977306366, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1217.0462322235107, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 286.5070390701294, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 888.1289672851562, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.9308776855469, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 886.5963172912598, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.52991771698, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 904.5785570144653, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.4401574134827, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 913.9846467971802, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.3406386375427, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 882.2273540496826, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.5999975204468, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 893.1643295288086, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.8686375617981, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 882.8012704849243, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.7043180465698, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 911.9380807876587, "config": {"BLOCK_SIZE_M": 256, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.6737604141235, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 922.434720993042, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.7742404937744, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 966.8311977386475, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.282398223877, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 902.4003148078918, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.1715264320374, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 911.4887952804565, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.7982449531555, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3696.5366554260254, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.8011236190796, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3732.1895599365234, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 491.22015714645386, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3699.6262168884277, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.662082195282, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3706.461296081543, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.19007635116577, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.9563217163086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.55840396881104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.3790431022644, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.49184131622314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.8371181488037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.13024044036865, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.67936658859253, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.29919958114624, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.3075189590454, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.582558631897, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.28384256362915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.3950409889221, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.87712049484253, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.2438397407532, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.7696056365967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.75872230529785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.2409596443176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.92655992507935, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 434.6607995033264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.481764793396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.4785614013672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.8153614997864, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.21663761138916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.93391704559326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.1708827018738, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.7081604003906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.4312014579773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 433.1222414970398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.8798394203186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.2542381286621, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.88272428512573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.95919704437256, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.57552576065063, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 436.070077419281, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.23888540267944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.29759550094604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.48527574539185, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.8566403388977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.7209553718567, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.8857612609863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.2379183769226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.0705609321594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.711040019989, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 461.6214370727539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.99391746520996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.7171139717102, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 452.97216176986694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.5228786468506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.865918636322, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.0990409851074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.3924775123596, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.5243248939514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.476318359375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.3304009437561, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.0241599082947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.72800636291504, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.6454372406006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.13855934143066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.2596802711487, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.92607593536377, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.67551851272583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.6871991157532, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.56575775146484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.3785548210144, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.9990358352661, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.39296531677246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.066400051117, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.92351770401, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.5449628829956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8848032951355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.401282787323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.1929602622986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.7252793312073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.19904041290283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.9563179016113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.3228807449341, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.9796743392944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.3915224075317, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.7502398490906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.7787222862244, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.8779225349426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.3395233154297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.1052803993225, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.9151916503906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8948850631714, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.8580799102783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.8652801513672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.5963191986084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.16176652908325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.57024002075195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.5827188491821, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.4780769348145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.6632022857666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.9329562187195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.5542378425598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.6558456420898, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.94880199432373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.8142399787903, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.0744061470032, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 433.7159991264343, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.7987289428711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.212797164917, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 433.98303508758545, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.1723213195801, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.4704008102417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.0047974586487, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.35472202301025, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.5628786087036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.4870457649231, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.09455823898315, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 
16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 436.3423991203308, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 436.0419225692749, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.9107208251953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.50063848495483, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 431.0747194290161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.21760272979736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.9652853012085, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.507200717926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.983521938324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.9140787124634, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.6187205314636, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.0563235282898, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.42288064956665, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 436.0275197029114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.83583784103394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.53711891174316, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.161283493042, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.2231955528259, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.5696039199829, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.108962059021, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.71615505218506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.38416624069214, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.7817621231079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 433.5774350166321, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.7136034965515, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.8428831100464, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.72543573379517, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.66991996765137, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.3833613395691, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.08656215667725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.93136405944824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.4388747215271, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.25920009613037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.52256441116333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.084321975708, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.54383850097656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.1487979888916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.73967456817627, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.51216411590576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.5579180717468, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 461.67664527893066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.00431728363037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.1380772590637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.94896268844604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.6292796134949, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.11967945098877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.18064069747925, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.40784072875977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.4022407531738, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.1137537956238, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.3993601799011, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, 
"num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.7697620391846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.1428818702698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.1088018417358, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.4974431991577, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.7798466682434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.57647609710693, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.9835209846497, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.5660786628723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.7955160140991, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.7660789489746, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.9689598083496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.82480096817017, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.4470367431641, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.72655391693115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3692812919617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.60896015167236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.3014392852783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.3415951728821, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.30239725112915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.74992275238037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.39855670928955, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.07999420166016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.2233581542969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.44000339508057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.57216024398804, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 440.8572793006897, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.29952239990234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.784321308136, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.8839955329895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.2393593788147, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.8023953437805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.40768480300903, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.9350414276123, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.70128059387207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.34560108184814, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.57952308654785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.24032258987427, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.06319522857666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.0708808898926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.7772798538208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.1243152618408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.14335918426514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.3484802246094, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.5487937927246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.39183950424194, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.69983768463135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.4884886741638, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.1195139884949, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.1302423477173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.23327255249023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.9713616371155, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 
128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.87023878097534, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.13808393478394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.71343994140625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.7079977989197, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.4326367378235, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.6467170715332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.18144035339355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.6388816833496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.1617622375488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.2748794555664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.4883222579956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.4385576248169, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
459.05888080596924, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.13343477249146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.4577589035034, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.0025644302368, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.91552114486694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.81663751602173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.81791830062866, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.4726357460022, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 767.197916507721, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.082079410553, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 786.714243888855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7886385917664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.852481842041, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.534722328186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.7598419189453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.1523246765137, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.7343945503235, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.99600315093994, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.84351873397827, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.9803171157837, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 434.1148781776428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.311044216156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.8899164199829, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.68144130706787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.30592012405396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.47135972976685, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.0942358970642, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.5675196647644, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.08352184295654, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.49744272232056, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.17920207977295, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.28320121765137, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.6542434692383, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.9796848297119, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.86656188964844, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.37951707839966, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.70063638687134, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.04495573043823, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.5516800880432, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.44208240509033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.1054368019104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.99567890167236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.7369589805603, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.05903673171997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.82495975494385, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.3484797477722, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.03663635253906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.28352975845337, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.36192178726196, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.17983865737915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.0140805244446, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.83839750289917, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.83168029785156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.1084842681885, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.49359607696533, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.62848377227783, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.5153613090515, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.2751979827881, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.63120794296265, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.09839391708374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.91007947921753, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.05375576019287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.8743968009949, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.2927956581116, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} 
+{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.1227169036865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.6068768501282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.54928064346313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.59391927719116, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.27872133255005, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.84575605392456, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.6030378341675, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.4520010948181, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.05903911590576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.8240032196045, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.7820816040039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.8951997756958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9270401000977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.4100775718689, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.1683259010315, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.6390390396118, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.0980763435364, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.1552023887634, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.9057626724243, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.018883228302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.23583698272705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.84751892089844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.3276786804199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.6649580001831, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.94271993637085, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
448.0824017524719, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.14528036117554, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.624963760376, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.932963848114, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.85599422454834, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.07487392425537, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.0648012161255, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.6894392967224, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.2630443572998, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.29567766189575, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.5257577896118, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.36144256591797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.9403233528137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.4174361228943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.61487865448, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.59791803359985, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.4643168449402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.6046404838562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.8256068229675, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.37983560562134, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.76239585876465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.8902382850647, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.6880025863647, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.0209550857544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 727.4769568443298, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.9939169883728, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.1294412612915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.8537578582764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.065755367279, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.6790347099304, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.3268814086914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.7532811164856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.6017642021179, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.5286350250244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.3379249572754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.78639793396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 737.7025604248047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 760.4059147834778, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.4943985939026, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.1902389526367, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 694.8574376106262, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.1652812957764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.5695986747742, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.905595779419, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.0817589759827, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.8324847221375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 689.0115189552307, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.53136444091797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.0795192718506, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.57920026779175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.50063610076904, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 466.73280477523804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.6331238746643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.6124768257141, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.79504013061523, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.2592034339905, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.07007360458374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.1156802177429, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.1540832519531, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.4987201690674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.5198383331299, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.9540796279907, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.0764765739441, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.64927768707275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.32623958587646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.90144443511963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.75680017471313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.53199768066406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.6596760749817, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.64463901519775, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.5447964668274, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.0244812965393, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.6344027519226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.3812837600708, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.2035217285156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.9411211013794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.7512001991272, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.03040409088135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.6561574935913, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.0724835395813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.797119140625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.79487705230713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.45487642288214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.6020760536194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.81135845184326, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.89231395721436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.06767749786377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.3732786178589, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.8678421974182, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.5870451927185, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.3310375213623, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.16400051116943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.76976346969604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.3358373641968, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.9075255393982, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.6555185317993, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.9603171348572, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.142080783844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.79616403579706, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.35599279403687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.23088359832764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.0662384033203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.3123264312744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.25360012054443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.5620756149292, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.20159816741943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.69071865081787, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 461.54159784317017, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.4303970336914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.38575553894043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.99744176864624, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.5207953453064, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 680.3891205787659, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.4177598953247, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.1387257575989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 
3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.3819231987, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.8014478683472, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.7635231018066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.9121556282043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.970883846283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 683.9639925956726, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.5492835044861, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.8779239654541, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.4555177688599, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.0505647659302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.4040040969849, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.1529560089111, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.3172812461853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.41279888153076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.99872398376465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.2251172065735, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.52207708358765, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.9119944572449, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.9054388999939, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.94224309921265, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.6446418762207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.9697642326355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.2275228500366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.8505549430847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.96239948272705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
446.3331198692322, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.59007930755615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.3022389411926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.8209595680237, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.61152029037476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.81071853637695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.35375785827637, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.0558409690857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.8596806526184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.2358388900757, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.249764919281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.3332781791687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.341121673584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.9678440093994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.4801607131958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.36239767074585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.50975799560547, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.04368591308594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.18607902526855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.6376004219055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.5953550338745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.16863918304443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.0497603416443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.24015522003174, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.57952308654785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.3806447982788, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.2219228744507, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.00928258895874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.09008026123047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.334077835083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.5054392814636, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 461.3097596168518, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.6617579460144, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.7420792579651, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.54159450531006, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 918.4292793273926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.1011204719543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 961.8967962265015, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
546.6216015815735, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 915.5791997909546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.8563194274902, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 949.522876739502, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.990403175354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.8790340423584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.780478477478, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.468638420105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.9583992958069, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.23247957229614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.21184253692627, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.02352380752563, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.8542437553406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.990882396698, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.1280016899109, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.9084792137146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.7408013343811, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.29120206832886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.7112030982971, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.6505618095398, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.3475275039673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.54944133758545, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.9099168777466, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.804160118103, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.77951860427856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.10079860687256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.08528327941895, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.9727964401245, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.8804793357849, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.1912021636963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.6355199813843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.6153607368469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.38479709625244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.7116723060608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.3585591316223, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.933916091919, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.7038416862488, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.1604833602905, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.399516582489, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.7622413635254, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.7667169570923, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.792311668396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.26416206359863, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.0612840652465, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.1763205528259, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.38415813446045, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.4963173866272, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.256959438324, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.0512022972107, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.8968005180359, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.2931170463562, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 510.3188824653625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.2918357849121, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.6201639175415, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.13759899139404, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.5639977455139, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.2959895133972, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.8947200775146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.170560836792, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2878403663635, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 605.8300733566284, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.7235188484192, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.28463888168335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.5547194480896, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.3111972808838, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.795202255249, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.3476800918579, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.31280183792114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.0838356018066, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.5984034538269, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.5651206970215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.0366430282593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.6552014350891, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.0913577079773, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.4553623199463, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.8739199638367, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.5588803291321, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, 
"num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.2456030845642, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.9768013954163, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.5862407684326, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.1004781723022, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.2425637245178, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.4713606834412, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.7796792984009, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.8694458007812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.1110420227051, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.9463987350464, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.807354927063, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.5708832740784, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.5168023109436, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.2422413825989, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.4414420127869, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.8063998222351, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 852.9510402679443, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 867.6070356369019, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 867.175350189209, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 876.607837677002, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 880.0158500671387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.2206449508667, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 856.5936088562012, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 882.6395177841187, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.28239822387695, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 441.6736030578613, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.68272161483765, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.80880212783813, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.1408009529114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.2080044746399, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.9041557312012, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.17791748046875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.8457589149475, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.82864809036255, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.9638395309448, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.3984022140503, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.29583501815796, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.7081651687622, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.9934401512146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0163216590882, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.99520111083984, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.1779217720032, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.1622462272644, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.7960014343262, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.2519965171814, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.47680377960205, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.38112020492554, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.260639667511, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.0785593986511, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.2048010826111, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.44480419158936, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.65999603271484, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.5547204017639, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.12479972839355, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.7275195121765, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.0918426513672, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.1025576591492, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.9486374855042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.7270374298096, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.3964776992798, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.0095992088318, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.7239999771118, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.3022375106812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.382559299469, "config": {"BLOCK_SIZE_M": 
64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.8540754318237, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.692325592041, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.0692849159241, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.1390419006348, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.959520816803, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.6307163238525, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 651.6734457015991, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7107257843018, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.9980821609497, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.27535247802734, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.18943786621094, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.12784004211426, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 455.7944059371948, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.93055868148804, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.1607995033264, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.3782424926758, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.37776231765747, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.69552659988403, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.3015990257263, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.2847990989685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.3799991607666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.1166386604309, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.84160137176514, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.42159605026245, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.3630437850952, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.49312019348145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.51471996307373, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.1001687049866, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.26128482818604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.3609666824341, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.0411195755005, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.81823539733887, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.16399765014654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.3020796775818, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2923192977905, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.5092759132385, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.7312026023865, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.1987190246582, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.5940837860107, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.9623975753784, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.7955207824707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.1281614303589, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.7175970077515, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.3671932220459, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.8595190048218, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.1315155029297, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.7591972351074, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.56207752227783, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 835.6031894683838, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 818.4662485122681, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 711.6188836097717, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.6275200843811, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.7331228256226, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.6758451461792, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.2902393341064, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 728.7976050376892, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.5948877334595, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 830.9124898910522, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.8158373832703, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 723.8008046150208, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 858.5790395736694, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 799.6700763702393, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.0625596046448, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 742.6270413398743, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 839.1521549224854, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 823.2535982131958, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.7518391609192, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 726.2433576583862, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 832.1303987503052, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 800.4521584510803, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 722.0651245117188, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 720.3507232666016, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 836.7769598960876, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 820.2064108848572, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.5403218269348, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 721.1760020256042, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 851.9001770019531, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.8185601234436, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.136962890625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 730.6113648414612, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1014.4359922409059, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 989.2935991287231, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1023.8619184494017, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 996.4471960067749, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1036.9926309585571, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1004.6833562850952, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1045.6390380859375, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 998.192629814148, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.0297622680664, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.181914806366, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.3636798858643, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.2863955497742, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.2203235626221, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.9193625450135, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.9127955436707, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.5395121574402, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.615843296051, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.1145558357239, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.5742311477661, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.4755244255066, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.7340793609619, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 574.1411185264587, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.6751928329468, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.7374377250671, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.435516834259, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.5256028175354, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.6639986038208, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.5617570877075, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.7862424850464, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.3012833595276, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.8846440315247, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.50416231155396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 790.8908820152283, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.452962398529, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 785.0726413726807, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 676.9409680366516, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 791.2734389305115, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.1209597587585, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.2348818778992, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 684.179515838623, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 651.5678429603577, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.069598197937, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.568642616272, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.3299198150635, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.6228814125061, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.4126400947571, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.7259168624878, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.2532777786255, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.3606419563293, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.1836829185486, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.3316802978516, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.53648042678833, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.3808007240295, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.6769647598267, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.5051217079163, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.65295696258545, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2353.8430309295654, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.8521604537964, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2359.606056213379, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
543.381917476654, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2369.126396179199, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.8824033737183, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2365.816173553467, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.8760004043579, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1705.1086330413818, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1187.0796871185303, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1709.7920036315918, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1207.513279914856, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1706.3822412490845, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1214.5001649856567, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1729.2284870147705, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1204.6752071380615, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1726.354742050171, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1190.2595043182373, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1732.1048164367676, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1186.7190408706665, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1709.7470474243164, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1185.4171133041382, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1713.63920211792, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1217.8872060775757, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1715.4060745239258, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1068.0207920074463, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1724.0322875976562, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1064.9443197250366, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1721.7067241668701, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1049.4910383224487, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1743.8265705108643, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1057.1158361434937, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6408.664588928223, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 815.3313589096069, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6532.301502227783, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.5982403755188, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6358.906688690186, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 852.1980857849121, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6353.005447387695, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 841.2091159820557, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.0852756500244, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
491.96560621261597, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.0860848426819, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.1376008987427, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.7555198669434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.1956729888916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.4433612823487, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.44815540313726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.150399684906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.0355176925659, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.43343925476074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.1057562828064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.6828765869141, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.20159626007074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.204161643982, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.87119865417486, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.4080033302307, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.6735987663269, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.88768100738525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.0135974884033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.7760000228882, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.6187224388122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.01408386230474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.68608140945435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.0433621406555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.7516837120056, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.4718384742737, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} 
+{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.5723304748535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.9081573486328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.06592035293585, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.4692831039429, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.1873579025269, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.12127351760864, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.0223970413208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.32496213912964, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.9419169425965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.1529603004456, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.21744441986084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.89936256408697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.1891169548035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.2335991859436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.1750416755676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.9977602958679, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9316725730896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.0199990272522, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.23136472702026, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.0340785980224, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.8393588066101, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.8497619628906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.7232003211975, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.32144498825073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.3366394042969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 521.4559960365295, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.1427221298218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.3049631118774, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.8111987113953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.55679845809937, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.9460768699646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.294397354126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.2887954711914, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.6310377120972, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.7464070320129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7935910224915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.8652720451355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.6719999313354, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.0483255386353, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.5270366668701, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 699.3484783172607, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.7481603622437, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.3449606895447, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 712.8734374046326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 730.3188872337341, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.720639705658, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.4857611656189, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.7827181816101, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.4227209091187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.3163223266602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.9443206787109, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.7758402824402, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 724.8345613479614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.0686411857605, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.6054368019104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 699.7318410873413, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.5819187164307, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.231517791748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.835994720459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.2327971458435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.839198589325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.8398427963257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.193118095398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.4913573265076, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.1529598236084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.0884766578674, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.1091213226318, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.064160823822, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.9113621711731, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.58448362350464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.8447952270508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.4459261894226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.73663902282715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.75567674636847, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.56704235076904, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.12464237213135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 495.7566428184509, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.83055925369257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.57632398605347, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.91792821884155, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.99424171447754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.93760347366333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.45055770874023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.68336057662964, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.6025619506836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.051522731781, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.7417573928833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.43215703964233, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.4963231086731, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.3262391090393, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.8684763908386, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.40191745758057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.0599961280823, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.69808626174927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.2529630661011, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.89280462265015, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.1897587776184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.74768114089966, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.26880168914795, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.31295919418335, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.19312381744385, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.6156873703003, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.93568181991577, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.8990378379822, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.1467208862305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.9643177986145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.08560037612915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.385437965393, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.7334337234497, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.52720308303833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.60815620422363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9214396476746, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.9135975837708, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.98560380935663, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.01311779022217, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.68336677551275, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.9588847160339, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.5678377151489, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.1737632751465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2985548973083, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.5193614959717, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.65968132019043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.51663637161255, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.5499176979065, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.1532826423645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.87968015670776, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.9691219329834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 512.2126388549805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.8881559371948, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.8816003799439, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.63264179229736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.3636770248413, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.3644785881042, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.1956796646118, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.080952167511, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6624007225037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.5057644844055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.6441602706909, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.5587210655212, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9558339118958, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 
32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.396960735321, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.2228832244873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.907205581665, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.5827231407166, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.8382411003113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.0884766578674, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.6764788627625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7270412445068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.233277797699, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.5124821662903, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.7892827987671, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.0086398124695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.16544103622437, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.6416029930115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.19519901275635, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.7753577232361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.5175995826721, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.1102418899536, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.88047790527344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.1379222869873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.59039878845215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.0420804023743, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.64144468307495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.0705590248108, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.71183919906616, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
480.72800159454346, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.91439628601074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.28063917160034, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.4681577682495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2731213569641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.6806421279907, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.23423957824707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.79680490493774, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.79872035980225, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.48015785217285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.9545621871948, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.79407596588135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2897605895996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.6164779663086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.4640016555786, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.9468755722046, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.4727997779846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.5027174949646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.96047639846796, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.56031942367554, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.48560094833374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.9825611114502, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.60783863067627, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.1030402183533, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.4342432022094, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.5916862487793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2044858932495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.70896196365356, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.49471616745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.4334387779236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.93199586868286, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.98320150375366, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.86560440063477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.60128116607666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.64447879791265, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.744001865387, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.3035182952881, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.3958430290222, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
503.44208002090454, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.0887985229492, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 866.906886100769, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.9582347869873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.1014375686646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.1659183502197, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 866.7200040817261, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.0760040283203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 869.6676921844482, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.3676776885986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.78768014907837, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.96239852905273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.2587242126465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.489764213562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.5822443962097, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.38720750808716, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.8851227760315, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.0804810523987, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.99680185317993, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.25840044021606, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.5697536468506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.9663972854614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.95616388320923, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.58015871047974, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.3919973373413, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.17039251327515, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.234881401062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.96000051498413, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.7324786186218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.10048151016235, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.99952459335327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.0616021156311, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.8316869735718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.04320192337036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.0323238372803, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.9233589172363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.7528042793274, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.81664228439325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.3879933357239, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.72768545150757, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.854241847992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.83984327316284, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.6008014678955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.2598400115966, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.09023904800415, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.21871566772455, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.729278087616, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.27040338516235, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6631994247437, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.3028817176819, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.19456005096436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 504.5822310447693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.2023949623108, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.4051175117493, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.744161605835, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.3804769515992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.448956489563, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.2846465110779, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.46095848083496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.48223924636835, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.07024240493774, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.55775880813604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.2099237442017, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.2321605682373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2912049293518, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3030366897583, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.72896242141724, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.7583951950073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.2696032524109, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.45199728012085, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.465916633606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.80944204330444, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.4297552108765, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.8574438095093, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.38479423522955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.62896394729614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6576013565063, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.303361415863, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.5648002624512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4457621574402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.4225606918335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.1566371917725, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.72463893890387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.38607931137085, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.7014427185059, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.5222392082214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.4579191207886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.5759954452515, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.6089601516724, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.6377558708191, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.1385588645935, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.7558331489563, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.7436771392822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.7686376571655, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.9998397827148, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.8911991119385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.7579202651978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.1947197914124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.6977610588074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.45823907852173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.331202507019, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.0756840705872, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 543.8144016265869, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.8078455924988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.0916843414307, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.9478406906128, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.9201622009277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.7793612480164, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 826.6424036026001, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.1076788902283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 722.3720026016235, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.0847988128662, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.8398418426514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 803.6212778091431, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 829.3494415283203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.8339185714722, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 726.9435238838196, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.683042049408, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.3646330833435, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 800.8910441398621, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.2955222129822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.8868818283081, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 734.4838428497314, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.6835179328918, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.7505531311035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 795.7335948944092, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 831.3246440887451, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.8571186065674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 731.4302444458008, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.1179141998291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.72704219818115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.22863578796387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.07167768478394, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.3001594543457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.31008195877075, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.32496452331543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.17152070999146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.7907247543335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.2497601509094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.6894392967224, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.1008014678955, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.0900812149048, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.82079696655273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.8907175064087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.8904004096985, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.81584024429327, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.4513602256775, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.9657621383667, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.19792652130127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.8231997489929, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.09055852890015, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.2382426261902, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.87568283081055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 503.51968050003046, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.94111680984497, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.9056010246277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.03504037857056, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.32799768447876, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.3825578689575, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.2289652824402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.18752098083496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.8846392631531, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.4145579338074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.21376180648804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.9907178878784, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.2952003479004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.7233514785767, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.4108853340149, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.8172783851624, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.9878387451172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.2219152450562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.44624423980713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.9891228675842, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.1318402290344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.4284768104553, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.9953546524047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.807680606842, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.0790395736694, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.03600025177, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.6793584823608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.5267171859741, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.8039951324463, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.1043186187744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9806389808654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.8462357521057, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.6281590461731, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.8124828338623, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.98912191390986, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.9908757209778, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.218234539032, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.8076710700989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.4571237564087, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.9916763305664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.9270415306091, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.6372828483582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.4788789749146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.2102379798889, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.878080368042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.3063960075378, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.6428799629211, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 651.9772815704346, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.8224067687988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.6457567214966, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.9532804489136, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 647.7401566505432, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.6641621589661, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.66304063797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.433919429779, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.3926358222961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.829761505127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.0047998428345, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.7449617385864, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.08128213882446, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.51520061492914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.3510456085205, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.0348844528198, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.78543853759766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.73440551757807, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.5628824234009, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.7692813873291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2723159790039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1662397384643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.6169595718384, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.05583572387695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.82895612716675, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.3371253013611, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.5660762786865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.5953674316406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.06799554824835, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.3316817283631, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.16192150115967, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.27135276794434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.666241645813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.54416275024414, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.2276768684387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.17775774002075, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.04383945465094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.5272011756897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.2217574119568, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.9835205078125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.8321557044983, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.53727436065674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
496.9332790374756, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.8271994590759, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.1246418952942, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.7857508659363, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.40608310699463, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.9288001060486, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.1668796539307, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.5225591659546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.7696051597595, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.3644866943359, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.249762058258, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.0703997612, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.35423946380615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.4539179801941, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.3895974159241, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.4971203804016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1055.4275178909302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.9315228462219, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1057.2361612319946, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.6897597312927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1055.867838859558, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.4671940803528, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1061.0974502563477, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.7521634101868, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.52943992614746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.08495855331427, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.1392059326172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1971244812011, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.5630431175232, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.80399799346924, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.97360038757324, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.8222389221192, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.5934357643127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.1843147277832, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.6988787651062, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.0335998535156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.70464277267456, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.7904000282287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.380163192749, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.1184000968933, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.10960149765015, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.47471618652344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.03136253356934, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.75695562362665, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8500752449036, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.1115159988403, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.5495972633362, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.72704124450684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.0276794433594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.90224170684814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 698.6412787437439, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.24688196182257, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 699.6337628364563, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.5758428573608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 698.2160019874573, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.3816027641297, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.4918427467346, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.9278435707092, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.0838398933411, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.234236240387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.9329586029053, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.7140774726868, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.7951965332031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.8648018836975, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} 
+{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.5775990486145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.6791982650757, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.0948805809021, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.0353574752808, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.3470392227173, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.4479994773865, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.0177612304688, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.5779228210449, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.1201601028442, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.35280227661127, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.4374408721924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.2092814445496, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.455677986145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.1811218261719, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.4419178962708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.8161606788635, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.9446401596069, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.8720011711121, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.5518345832825, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.1255984306335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.2588815689087, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.5081572532654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.2849626541138, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.2387142181396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.9342470169067, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
682.4647974967957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.7521586418152, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.4576063156128, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 726.3558435440063, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.9828791618347, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.5409641265869, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.6811218261719, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.7537617683411, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 742.9540824890137, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 730.9065628051758, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 718.0647993087769, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.320164680481, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.039514541626, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.2228770256042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.3521609306335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.824960231781, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 723.3262419700623, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.501118183136, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.5208015441895, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.878876209259, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.4704031944275, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 733.6801552772522, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.2984027862549, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 940.7623863220215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 955.5958366394043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 953.7668800354004, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 
2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 951.1556768417358, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 944.8329639434814, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 963.8323259353638, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.7395153045654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 963.7534379959106, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.2254385948181, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.4683165550232, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.3787198066711, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.4079999923706, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.2459173202515, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.318877696991, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.6484789848328, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.2289581298828, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.5846428871155, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.2350363731384, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.6835241317749, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2571206092834, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.5548787117004, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.8451209068299, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.7647972106934, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.2681574821472, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.0959987640381, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.92512702941895, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2835206985474, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.1003170013428, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
547.8113651275635, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.850239276886, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.3204736709595, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.534722328186, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2062406539917, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.7150421142578, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.4735984802246, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.7067203521729, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.6193552017212, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.5945534706116, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.5275173187256, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.3984007835388, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.6486411094666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.5971174240112, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 718.0782389640808, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.5731272697449, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.7060770988464, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.7814388275146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.1334404945374, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.5880002975464, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.5689625740051, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.8631939888, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 718.2345581054688, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.2115187644958, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.0481600761414, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.1619215011597, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} 
+{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.7983980178833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.7308821678162, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.82367610931396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.340961933136, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.59951972961426, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.330717086792, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.3159999847412, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.0375967025757, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.3964800834656, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.955837726593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.5385570526123, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.7988820075989, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.8028831481933, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.7795214653015, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.76224279403687, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.5102415084839, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.5587229728699, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.95167875289917, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.45408296585083, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.69631719589233, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.9968008995056, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.6489644050598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.63295888900757, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.6289563179016, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.32287883758545, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 493.4457588195801, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.3753633499146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.6912040710449, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.8660802841187, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.6347208023071, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.1766395568848, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.0535988807678, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.3782396316528, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.756959438324, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.949914932251, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.4900794029236, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.9505658149719, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.292640209198, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.938717842102, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.1531229019165, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.5751953125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.895998954773, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 926.3916778564453, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 888.7593650817871, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.0353546142578, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 791.7734408378601, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 929.420166015625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 884.3414449691772, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 766.3147211074829, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 807.7310419082642, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.5884742736816, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 
4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 902.4166536331177, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 781.7185640335083, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 789.637439250946, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.4544010162354, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 887.3819255828857, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.3927998542786, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 809.4550561904907, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.191349029541, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 903.2862281799316, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.9759964942932, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 790.7872009277344, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 926.6812753677368, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 894.0708780288696, "config": {"BLOCK_SIZE_M": 128, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 784.125759601593, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 787.0670342445374, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 929.6679878234863, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 903.9553642272949, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.4537601470947, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 789.2934417724609, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 930.4092741012573, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 892.2116899490356, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 789.1793632507324, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.3494372367859, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1137.5139141082764, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1090.9420776367188, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 1137.4851179122925, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1101.7759990692139, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1130.6772804260254, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1092.7225637435913, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1142.1615934371948, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1100.2232074737549, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.1915183067322, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.0406422615051, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.7856040000916, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.0784091949463, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.0051217079163, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.5140814781189, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.0745568275452, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.4980778694153, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.2331204414368, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.0476822853088, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.6044769287109, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.461594581604, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.7257614135742, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.5364837646484, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.5271973609924, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.1447939872742, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.4283208847046, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.2326436042786, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.593279838562, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.9751996994019, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.8041605949402, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.6540746688843, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.3017597198486, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.1187200546265, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 864.3463945388794, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 742.0249629020691, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 862.1806335449219, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.2153587341309, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 869.3523120880127, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.6206398010254, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 868.528323173523, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.3745565414429, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
755.306236743927, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.1201601028442, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.6222410202026, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.6719989776611, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.098560333252, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.1249570846558, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 689.7984051704407, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.312801361084, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.5449585914612, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.4169569015503, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.5604820251465, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.1217579841614, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.3001565933228, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.9342365264893, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.6128039360046, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.1451215744019, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2486.128807067871, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.3510394096375, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2486.2731170654297, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.864963054657, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2493.8580989837646, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.494556427002, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2496.9319820404053, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.5116767883301, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1867.194709777832, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1290.5307245254517, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1900.1716804504395, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1321.0308742523193, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1893.198709487915, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1319.5684814453125, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1890.816307067871, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1333.3575963974, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1875.5905723571777, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1301.571192741394, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1901.9284629821777, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1325.803198814392, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1898.8699054718018, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1305.6180810928345, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1894.7088050842285, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1328.2320070266724, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1886.0566425323486, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.46928024292, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1887.835521697998, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1171.8750476837158, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1886.886568069458, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1172.1923303604126, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1892.0287895202637, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1172.137746810913, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7437.707328796387, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.2992143630981, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7446.624984741211, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 961.2612819671631, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7449.484100341797, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 951.5705585479736, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7462.863845825195, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 963.9700746536255, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.4683265686035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.8812799453736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.2039957046509, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.8849558830262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.535041809082, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.6169619560241, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.2484784126282, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.2203259468079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 
8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.99952030181885, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.2609548568726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.0217633247376, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.69744157791143, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.3819198608398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.03440093994146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.8193626403809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.5635228157043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.5350422859192, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.783679485321, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.25088262557983, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.3147230148315, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.6255970001221, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.92319965362555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.0958437919617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8707165718079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.5857620239258, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.54591274261475, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.5342445373535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.8804788589477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.7443189620972, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.62703847885126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.136960029602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.5027170181274, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.5225553512573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 503.8708806037903, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.98223972320557, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.7809634208679, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.8465633392334, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.1214451789856, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.06112241745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.9718308448792, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.25199985504156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.18080043792725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.1675229072571, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7388806343079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.5206365585327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7491164207458, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.0622401237488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.7001576423645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.76000070571905, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.197283744812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.6582384109497, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.6271958351135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.5048017501831, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.8639969825745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.4891223907471, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.8833632469177, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.92848157882685, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.8481554985046, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.08863735198975, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2897582054138, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.8878445625305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.4760003089905, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.6807999610901, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.4735999107361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.0507230758667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.1046380996704, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.5580816268921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 727.6252770423889, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.7937569618225, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.9860825538635, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.1772804260254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.8801574707031, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.6022386550903, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.5713601112366, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.6812782287598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 733.1273603439331, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.0902342796326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.7324805259705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.580798625946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.8555135726929, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.4102449417114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.7283158302307, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.3057560920715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 726.580958366394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 595.8023977279663, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.2582440376282, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 735.271520614624, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 756.7788791656494, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.7030410766602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.3348803520203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.9729561805725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 734.891197681427, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.9734344482422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.4046382904053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.7936015129089, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 764.5177602767944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3163237571716, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.20784091949463, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.74639987945557, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.653603553772, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.2846422195435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.91664266586304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.6219220161438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.8408017158508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.1708860397339, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.87439918518066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.3262367248535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.40255975723267, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9654397964478, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.1000008583069, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.4718360900879, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1510405540466, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.8039984703064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.7027220726013, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.2012801170349, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.23839950561523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.1929655075073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.3583993911743, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.53119945526123, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.5385603904724, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.8937606811523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.7992067337036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.3260793685913, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.04560232162476, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.7900810241699, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.5387210845947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.19727754592896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.35712242126465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.2566428184509, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.8076796531677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.5062427520752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.4223985671997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.74303865432734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.8964829444885, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.195041179657, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 529.9041628837585, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.346875667572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.742883682251, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2823967933655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.3124761581421, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.0180788040161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.20256042480474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.4777574539185, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.8342337608337, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.34767723083496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.69407510757446, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.0776033401489, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.5623970031738, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.295841217041, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.77536582946783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.6928019523621, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7587199211121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.7124800682068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.0947203636169, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.5115194320679, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.4115238189697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.649441242218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.77664709091187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.9239993095398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.5728025436401, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.7089648246765, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.0392031669617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2116832733154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.5271978378296, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.3435215950012, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.3740773200989, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.6324815750122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.4326395988464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.4683208465576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.4478397369385, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.2807984352112, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.4932799339294, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.4372835159302, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.9358386993408, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.9153552055359, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.4440026283264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.95216274261475, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.37168264389044, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.95215940475464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.6353597640991, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.2968006134033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.33807945251465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.9230399131775, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.21999740600586, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.9783983230591, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.5326375961303, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.7496008872986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.99999380111694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.1078386306763, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.2721600532532, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.1339225769043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.41776323318476, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.6593608856201, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.3484783172607, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.18656301498413, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.54144525527954, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.8801574707031, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0513668060303, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.6087999343872, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, 
"num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.4169602394104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.27823972702026, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.0735998153687, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.24592208862305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.59504032135004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.9980845451355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.8708758354187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.70400524139404, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.5147199630737, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.859034538269, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.0708780288696, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.243043422699, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.9550361633301, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.24896287918085, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0473589897155, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.9753608703613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.4312000274658, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.4337573051453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.00447845458984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.1342363357544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.1036796569824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4849605560303, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.468475818634, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.4718384742737, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.40480041503906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 517.4371123313904, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.7673602104187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.7313566207886, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.2281656265259, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.5443224906921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.30847787857056, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.067202091217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.25247716903687, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 873.9595222473145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.3214464187622, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 875.6593608856201, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.8228802680969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 878.1020879745483, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 
64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.2180762290955, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 880.4219150543213, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.1100835800171, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.27296209335327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.1137628555298, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.2151994705201, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.0030369758606, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.88592195510864, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.97920179367065, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.8961606025696, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.6764802932739, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.596800327301, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.99375581741333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.06112337112427, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.85072040557867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.7419214248657, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.77120208740234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.08976078033453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.7332777976989, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.28240728378296, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.6641597747803, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.22351980209345, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.2849593162536, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.53344202041626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.03599643707275, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
496.4019227027893, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.38560771942144, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.8268775939941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.10064029693604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.7129611968994, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.7127995491028, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.9963202476501, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.41280174255365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.0297555923462, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.87760210037237, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.4785590171814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.77696561813354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.9913630485534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.2907176017761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.331042766571, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.2971234321594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.4036812782288, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.7142324447632, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.8615975379944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.179039478302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4752068519592, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.0612831115723, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.9600014686584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.4860768318176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.8635258674622, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.8441586494446, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, 
"num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.6832003593444, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.8817615509034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2287936210632, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.4427247047424, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.7012791633606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.8084826469421, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.4262442588806, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.8625593185425, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.3200030326843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.30128383636475, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.8518390655518, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.5222420692444, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.7392001152039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 
32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6697607040405, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.3364782333374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.8494448661804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.3870453834534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.3788776397705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2649564743042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.3494372367859, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.9249663352966, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.6672039031982, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.4611210823059, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.7062420845032, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.6132788658142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
513.6665630340576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.5395140647888, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.4452862739563, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.1038355827332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.5643196105957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.0839996337891, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.912317276001, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2507171630859, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.827995300293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.1527991294861, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.2779188156128, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.7446374893188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.9292783737183, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.7076783180237, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 676.8953657150269, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.4212794303894, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8569560050964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.6057634353638, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.5054368972778, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.1630387306213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.7710385322571, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.9713578224182, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.9702415466309, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.8340802192688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 815.1260709762573, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 849.7894334793091, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.2385592460632, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 748.7366414070129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.6025590896606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.8425626754761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 818.4918355941772, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.1739177703857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.3099236488342, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.4212837219238, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 774.86896276474, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.1727957725525, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 821.2684774398804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 839.7425603866577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.7608008384705, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.967839717865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.8961625099182, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.0735983848572, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 834.2803192138672, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 852.4587154388428, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.2123212814331, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 756.9198393821716, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 784.0535974502563, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.164475440979, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.27888107299805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.1497611999512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.5118398666382, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 501.98335886001587, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.20271825790405, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.33328008651733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.408317565918, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.47759771347046, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.7579164505005, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.1075191497803, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.4516825675964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.7830386161805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.10864782333374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.3663954734802, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.5196771621704, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.7567992210388, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.31120014190674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.8830361366272, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.3494429588318, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.59376192092896, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.0660786628723, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.94495820999146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.8851180076599, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.38991832733154, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.5830411911011, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.67184019088745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1459274291992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.78336334228516, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.93295907974243, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.28384017944336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.3361630439758, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.2728028297424, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.6196827888489, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.5419187545776, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.9947237968445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.6750349998474, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.8462347984314, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.8102374076843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.0745620727539, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.0027122497559, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.4966340065002, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.5771150588989, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.0299229621887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.1224040985107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.2390370368958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.3196802139282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.1967968940735, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.1902418136597, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.3068809509277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.1159982681274, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.3673648834229, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.8622407913208, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.9343981742859, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.4081563949585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 557.5344014167786, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.392156124115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.4068827629089, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.2889575958252, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.9097609519958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.9139218330383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.9636859893799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.0587229728699, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.2937593460083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.913122177124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.0183973312378, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.3556823730469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.4774346351624, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.9942388534546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 756.0206389427185, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.5699229240417, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.0072040557861, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.5899243354797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.4566445350647, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.6368045806885, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.2313628196716, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.2142376899719, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.5833673477173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.510404586792, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.4208011627197, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.1846413612366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.8795166015625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.7809553146362, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.5174403190613, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2390375137329, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.6062397956848, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9942407608032, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.2299199104309, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.2062368392944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.08799934387207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.2209596633911, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.283196926117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.86656570434576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.8977608680725, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.5569634437561, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.0595188140869, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.9227194786072, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.745436668396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.797435760498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2140765190125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0139255523681, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.3289575576782, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.31904363632196, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.208963394165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.382077217102, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.26863384246826, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 512.9175972938538, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.4651203155518, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.64863872528076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.02175855636597, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.47824239730835, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.1793541908264, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.87583684921265, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.739360332489, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.9312014579773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.6950354576111, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.1539249420166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.8056025505066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.5804824829102, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} 
+{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.349123954773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.3574371337891, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.0881609916687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.8896050453186, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.6312022209167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.5390415191651, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.3262414932251, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.6239976882935, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.5355200767517, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1065.3275203704834, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.1688051223755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1065.5456018447876, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.2036776542664, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1066.7499113082886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.6606431007385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1070.8376026153564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.6713600158691, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.1280026435852, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.5576062202454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.0049605369568, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3812808990479, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.12672090530396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.42480421066284, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.9694356918335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.8934421539307, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 526.0449600219727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.9678421020508, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.0359992980957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.1976022720337, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1160001754761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.4071955680847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.7804846763611, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.9694423675537, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.7489619255066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.1625638008118, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.480637550354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2881603240967, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.7779173851013, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.6758422851562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.71952056884766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.9921536445617, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.597761631012, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.7950415611267, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 706.3737607002258, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.8225626945496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.4260787963867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.4784049987793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.6713557243347, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.8142409324646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.1977610588074, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.2214407920837, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.7790384292603, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.812958240509, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.4497566223145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.1145629882812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.3395204544067, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.2824048995972, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.9204816818237, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.5564775466919, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7796792984009, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.9665613174438, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.3696007728577, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.6177606582642, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.3550367355347, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 683.5390400886536, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.5329623222351, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.6483225822449, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.0748791694641, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.3558411598206, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.6289596557617, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.991204738617, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.2774410247803, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.7950401306152, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.5104055404663, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.3081617355347, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.029914855957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 636.6945600509644, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.889760017395, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.8529539108276, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.8329563140869, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 686.0369610786438, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.366231918335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.181921005249, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.2339253425598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.4809565544128, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.0009617805481, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 736.0620784759521, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.1799988746643, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 716.3363218307495, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 680.030562877655, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.0123243331909, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 756.300802230835, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.7283225059509, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.6806416511536, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.4948744773865, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 681.5555191040039, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.2014403343201, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 748.2766389846802, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.362238407135, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.7193627357483, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.5756778717041, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.622880935669, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.4641585350037, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.6007976531982, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.4745578765869, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 967.0486497879028, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 981.858081817627, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 967.1417617797852, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 984.5600080490112, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 977.3425722122192, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 993.8640022277832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 973.7819147109985, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 986.3534450531006, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.4427223205566, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.7827177047729, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.357922077179, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.9599962234497, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.480486869812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.8219218254089, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.4051179885864, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.8777604103088, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.3100790977478, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.51216173171997, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.5694403648376, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.4828805923462, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.0248031616211, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3713603019714, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 554.4503998756409, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.5815968513489, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.5576004981995, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.74351882934565, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.2067174911499, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.5990414619446, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.411835193634, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.5153579711914, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.1009631156921, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.5553612709045, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.6894435882568, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.0027203559876, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.039840221405, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.4851202964783, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.0019197463989, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.6339201927185, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.1155195236206, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.5940766334534, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.7227206230164, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.7873635292053, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.6532821655273, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.4718379974365, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.348482131958, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.0403227806091, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.6062397956848, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.9921607971191, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.3544011116028, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.4743933677673, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 730.7888007164001, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.490879535675, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.1633596420288, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.009759426117, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 728.9177632331848, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.3723225593567, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.5254368782044, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.2932877540588, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.3552031517029, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.6464009284973, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.9086427688599, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.0392036437988, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.6619186401368, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.1876864433289, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.60911846160883, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.2755198478699, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.253764629364, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.6262397766114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.22016048431396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.2065658569336, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.056797504425, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.15903711318964, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2193646430969, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 507.51519918441767, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.41839694976807, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8806443214417, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.24096345901495, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.00079870224, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.11279296875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.0025644302368, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.5606412887573, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.5956768989563, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.9220786094666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.1953620910645, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.1167960166931, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.9855990409851, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 
64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.8329563140869, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.1136012077332, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.9009585380554, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.4697608947754, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.7167973518372, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.9011149406433, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.1464066505432, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.9596853256226, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.6259198188782, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.7163186073303, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 938.8788795471191, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 926.3723182678223, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 798.2215976715088, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.4881620407104, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 937.4750423431396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 910.9724760055542, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 798.9476799964905, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 824.2094421386719, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.301760673523, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.9924745559692, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 803.9201641082764, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.0571212768555, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 937.4425601959229, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 917.7323198318481, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 807.1347188949585, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.3014450073242, 
"config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.6163158416748, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.038722038269, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 807.0151948928833, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.3988752365112, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.3508749008179, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 909.0921545028687, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 812.5785684585571, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 828.8521575927734, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 935.1947164535522, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 937.8656005859375, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 807.895359992981, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.254716873169, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 934.7740745544434, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 913.4953594207764, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 808.8092851638794, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 820.6372833251953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1162.8575944900513, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1116.4334392547607, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1167.4163103103638, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1125.512957572937, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1170.0452756881714, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1124.5208024978638, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.3548774719238, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1126.8772792816162, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.5449638366699, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, 
"num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.9435224533081, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.5649628639221, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.4172763824463, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.9807939529419, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.181441783905, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.1183996200562, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.6353578567505, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.673436164856, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.9851198196411, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.6353588104248, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.5921626091003, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.7888011932373, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.161762714386, "config": {"BLOCK_SIZE_M": 128, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.8716778755188, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.1267204284668, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.9352049827576, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.2806377410889, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.553759098053, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.0364799499512, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.9164791107178, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.9308862686157, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.1262383460999, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.6832041740417, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.9193496704102, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.7996706962585, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 875.9024000167847, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.686719417572, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 879.5030403137207, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.5804800987244, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 875.535831451416, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.1651167869568, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 768.022563457489, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.7870440483093, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 701.6942381858826, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.4657578468323, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 773.1475162506104, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.4145665168762, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.575680732727, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.0785593986511, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 774.6984004974365, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.0764765739441, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.1355142593384, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.4337606430054, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.0345640182495, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.5566358566284, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.136157989502, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.943356513977, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2508.334894180298, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.6436805725098, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2514.455032348633, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.0075182914734, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2523.786735534668, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.0100803375244, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2527.8945636749268, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.1134457588196, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1932.8961658477783, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1341.5974426269531, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1942.062873840332, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1347.3326444625854, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1935.0201511383057, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1346.166877746582, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1949.9556732177734, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1365.85120677948, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1933.2398414611816, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1348.529920578003, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1949.1075229644775, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1369.346890449524, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1938.2436752319336, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1351.6355180740356, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1949.699535369873, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1366.9753646850586, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1873.7366390228271, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1179.4961547851562, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1870.349416732788, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1183.728632926941, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1871.7929649353027, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1183.046236038208, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1880.8094501495361, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1183.2462406158447, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7475.730094909668, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 969.1713571548462, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7479.1412353515625, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 959.0225601196289, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7483.934288024902, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 964.6747159957886, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7496.789436340332, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 971.7057609558105, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.6713662147522, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.8851232528687, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.6027212142944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.5193600654602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.4046368598938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.1734399795532, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.0147247314453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.209282875061, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.1667170524597, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.7363204956055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.9700798988342, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.7659244537354, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 706.0740852355957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.6647968292236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.0038404464722, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.0582399368286, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.7942471504211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.1094398498535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.713595867157, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.670560836792, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.5830335617065, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.1243262290955, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.619038105011, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.9561624526978, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.3785552978516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.0180811882019, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.2615966796875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.8390383720398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.2619218826294, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.9067144393921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.0630431175232, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.2393565177917, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.4468803405762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.7148795127869, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.1697587966919, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.9476790428162, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.2892804145813, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.7883162498474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.1844792366028, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.7217602729797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} 
+{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.924159526825, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.1017599105835, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.7158427238464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.360643863678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 684.3721628189087, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.601279258728, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.3966360092163, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.652322769165, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.9367957115173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.0846381187439, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.2703990936279, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.9737615585327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 684.6873593330383, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.2043290138245, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.900803565979, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.6604833602905, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.9254412651062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.320484161377, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.006402015686, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 637.566237449646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.8217625617981, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.6489610671997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.5377588272095, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.4278383255005, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.0580821037292, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 720.7521629333496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1009.0684795379639, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1018.0921697616576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.5444717407227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 795.1929616928101, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1114.061918258667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1154.6329593658447, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.7137603759766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 720.7259202003479, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1016.3032007217407, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1034.8028755187988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.8511934280396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 779.7164750099182, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1117.9559993743896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1153.8620805740356, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.6462388038635, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 707.2187232971191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1001.5480089187623, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1029.5339107513428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 815.8494400978088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 783.7220740318298, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1108.8283157348633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.9406394958496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.6627163887024, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.2159967422485, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1005.9084796905518, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1040.8699131011963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 814.3107175827026, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 794.0518426895142, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1116.924638748169, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1147.9395151138306, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1267156600952, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.82159948349, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.1087980270386, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.0385665893555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.9025611877441, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.1406364440918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.3328008651733, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 560.7372808456421, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.5174479484558, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.080801486969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.1790399551392, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.952962398529, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.1627202033997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.3729600906372, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.1169624328613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.6204733848572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.0316743850708, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.03040599823, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.0777683258057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.1158423423767, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.1169624328613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.734076499939, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.0569581985474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.7198433876038, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.67568063735956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.9483199119568, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.3017654418945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.7483205795288, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.5814366340637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8699178695679, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.1345596313477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.4087963104248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.3142371177673, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.8596787452698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.5419187545776, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.4558439254761, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.9710369110107, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.161759853363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.0516781806946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.2230386734009, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.5481648445129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.3644776344299, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.406081199646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.2358388900757, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.8279943466187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.7825598716736, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.4593605995178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.5555186271667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.3673620223999, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.1907229423523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.0174431800842, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 668.470561504364, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.4100794792175, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.6692814826965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.7447986602783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 663.4708762168884, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.8035154342651, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.7553558349609, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.5339183807373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.3369617462158, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.0831990242004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.6193590164185, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.316478729248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.8935956954956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 668.8748788833618, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 852.6583957672119, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.2356820106506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.2271971702576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.1395163536072, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 852.2142362594604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.526713848114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, 
"num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.6163196563721, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.042558670044, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 850.1929664611816, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.3739161491394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.8544034957886, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.5081577301025, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 850.6859254837036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.7687983512878, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.903196811676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.1617588996887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.0119996070862, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.4231996536255, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.7622404098511, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.6711988449097, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.9763188362122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.2238402366638, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.5916819572449, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.0360078811646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.9438409805298, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.898886680603, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.6342406272888, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.7265605926514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.2513599395752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.3668761253357, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.036322593689, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 536.0531187057495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.967200756073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.4104042053223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.6084780693054, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.4782409667969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.3795185089111, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.6808032989502, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.0079989433289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.4396843910217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.9755210876465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.0873599052429, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.6600017547607, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.4667267799377, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 
128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.7835211753845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.511522769928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.9318370819092, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.8947186470032, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.505922794342, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.5931210517883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.5510406494141, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.2048034667969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.8928046226501, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.3248019218445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.7961616516113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.4975981712341, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.5804858207703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.3884830474854, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.4143953323364, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.9025626182556, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.9694428443909, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.1732788085938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.5456023216248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.04816198349, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.990716457367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.1780805587769, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.9928021430969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.8092789649963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.7881650924683, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 578.4100842475891, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.0059237480164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1144.3814420700073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.7180852890015, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1148.235206604004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.495837688446, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1151.7169618606567, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 707.9108786582947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1154.2894315719604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.6063995361328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.2542419433594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.8588809967041, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.8124809265137, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} 
+{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.4609594345093, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.8584027290344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.3590393066406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.7190327644348, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.6623978614807, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.488000869751, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.9628767967224, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.197274684906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.1811203956604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.4787158966064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.7526388168335, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.5280017852783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.4454426765442, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.6332874298096, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.001437664032, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.0849566459656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.4868807792664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.8779172897339, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.4171242713928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.8990454673767, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.05215883255, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.9527988433838, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.4622392654419, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.4916753768921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.3515167236328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 631.8542385101318, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.0782413482666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.0983963012695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.3134436607361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.5363283157348, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.1740880012512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.1343951225281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2897567749023, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.4091172218323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.1241555213928, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.6756825447083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.6694359779358, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.52176332473755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.4608058929443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.0254459381104, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.8531193733215, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.7758402824402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.1686396598816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.019202709198, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.7179160118103, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.6121602058411, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.0048007965088, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.6948852539062, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.2860770225525, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.8935966491699, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.7124810218811, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.154399394989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.7255954742432, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.811683177948, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.8049640655518, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.7662420272827, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.0038361549377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.4740796089172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.032799243927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.7988796234131, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.2593545913696, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.94864320755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.9268846511841, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.0091166496277, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.4808011054993, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.3547253608704, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.0809607505798, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.5339198112488, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.300802230835, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.301281452179, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.756317615509, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.2585587501526, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 668.8273596763611, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.1979155540466, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.6921591758728, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.1489591598511, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 694.8887991905212, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.9512014389038, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.1457605361938, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.2275233268738, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.5281620025635, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.7999978065491, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.8827204704285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.3004789352417, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 697.3012828826904, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.7313613891602, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.3812766075134, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.387686252594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 681.3596749305725, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, 
"num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.005443572998, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.109920501709, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.8323173522949, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.567684173584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.312162399292, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 854.4390487670898, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 871.0507202148438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.6833591461182, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 786.16783618927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 797.6203179359436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 650.427520275116, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 852.6073598861694, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 866.4502382278442, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.580641746521, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.9768013954163, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 791.4603209495544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.9311981201172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 853.8056135177612, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 868.8295888900757, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.5865564346313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.623366355896, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 794.4684815406799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.7107300758362, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 863.1739234924316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 875.4688024520874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 649.9454379081726, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 792.3839998245239, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.8836889266968, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.4883232116699, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.893753528595, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.0329637527465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.5475211143494, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.2343997955322, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.5740795135498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1735982894897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.0507230758667, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.6592001914978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.63615894317627, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.43184661865234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.9995174407959, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.7878384590149, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.33216381073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.2516784667969, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.6526412963867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.9236760139465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.9419240951538, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.1302423477173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.0460858345032, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.3809537887573, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.6100778579712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.390073299408, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.6424036026001, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.71856021881104, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.370080947876, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.663996219635, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.4319982528687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.4270358085632, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.1572771072388, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.6132822036743, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.001763343811, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.0502376556396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.2239990234375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.7252779006958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.8761615753174, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.212480545044, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.2411203384399, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.6987233161926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.2334442138672, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.9057574272156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.7734408378601, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.7116866111755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.1684803962708, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.4588813781738, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.2193641662598, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.5753588676453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.8507256507874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.5831990242004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.91583776474, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.0723218917847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.6406421661377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.1825585365295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.177761554718, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.8227186203003, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.2686395645142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.3761596679688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.694239616394, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.731207370758, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.8703980445862, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.6918387413025, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.6753611564636, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.781277179718, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.4887948036194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.0614433288574, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.7217597961426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.9131212234497, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.784637928009, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.2627205848694, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.5252766609192, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.6939206123352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.7620787620544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.3003196716309, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 774.2734432220459, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 682.3854374885559, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.4878416061401, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.5934376716614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 767.8214430809021, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.0168042182922, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.9539203643799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.4361577033997, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.73648166656494, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.8363256454468, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.4070334434509, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1123223304749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.7001643180847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 518.1643223762512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.7086415290833, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.0047979354858, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.4697570800781, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6187181472778, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.414234161377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.5281586647034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.5145573616028, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.5441589355469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.1076803207397, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.5953569412231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.3011207580566, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.781596660614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 
128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.1494436264038, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.1592030525208, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.0659184455872, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.4612803459167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.9817600250244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.356960773468, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.6849637031555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.3753528594971, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.2321605682373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.7380752563477, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.740475654602, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8348798751831, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.3731231689453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7470369338989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.2252793312073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.5422420501709, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.1472010612488, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.4108815193176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.8107171058655, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.8635206222534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.1417646408081, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.7251200675964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.0907163619995, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.9886436462402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.8934426307678, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 525.5759978294373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.6726403236389, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.4998464584351, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.4041676521301, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1076.6366386413574, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.8084797859192, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1079.6487951278687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.598714351654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1082.57408618927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.8479986190796, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1088.6448001861572, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.7835149765015, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.5625591278076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} 
+{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.8921551704407, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.575840473175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.6927995681763, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.4260811805725, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.15808248519903, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.065279006958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.5601649284363, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.2265601158142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.6760034561157, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.738558769226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.4280014038086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.5155172348022, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.4579191207886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 
256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.2415995597839, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.5931262969971, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.8719964027405, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.5687975883484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.923360824585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.0590362548828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.4257607460022, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.8180747032166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.4412860870361, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.9108805656433, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.6484794616699, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.3580794334412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 719.2320036888123, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.8339157104492, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.9756836891174, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.5992031097412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.0547204017639, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.9952020645142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.4049587249756, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.0150356292725, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.4175987243652, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.8760032653809, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.882246017456, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.6726403236389, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.9423985481262, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.2865600585938, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.254554271698, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.7393655776978, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.3375973701477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.9926424026489, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.2015962600708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.0145578384399, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.8340797424316, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.0331211090088, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.14319896698, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.2731223106384, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.5272002220154, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.6798377037048, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.8075256347656, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.1019263267517, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.7054409980774, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.764479637146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.3817558288574, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.2521662712097, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.5632004737854, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 663.4691143035889, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.9612817764282, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.2305612564087, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.7356758117676, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.3982396125793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.8871955871582, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 749.4414401054382, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.564001083374, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.2438387870789, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 774.2369604110718, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.3542394638062, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.6750478744507, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 742.8524780273438, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.4099254608154, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.1587224006653, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 771.065924167633, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.6823945045471, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 650.2564787864685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 745.3151965141296, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.4660768508911, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 759.3793654441833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 779.9107146263123, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 769.486403465271, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.2800006866455, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.218403339386, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.6131181716919, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 758.4811210632324, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 779.8339152336121, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.0292820930481, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 996.4808034896851, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1018.601279258728, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} 
+{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 992.7102375030518, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1018.5784053802491, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1005.8182382583618, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1023.4287977218628, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1006.7497634887694, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1023.1715202331544, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.5576019287109, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.6020832061768, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.6030349731445, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.2105631828308, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.1219258308411, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.1479992866516, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.8887987136841, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 
64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.1942348480225, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.7524819374084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.6739249229431, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.3076763153076, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.4382433891296, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.2767949104309, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9227228164673, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.2659244537354, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.2568025588989, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.70383644104, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.4332799911499, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.380156993866, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 547.9524827003479, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.2622423171997, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7368021011353, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.5859236717224, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.0787234306335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.0035157203674, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.995840549469, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.1692771911621, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.7068791389465, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.1044821739197, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.5995163917542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.7801637649536, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.5883226394653, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 681.4103960990906, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.3209552764893, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.2929592132568, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.0465593338013, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.4545550346375, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.3747138977051, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.3230395317078, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.5823993682861, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 680.5529594421387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.3535962104797, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.3708744049072, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.6663956642151, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 680.5982398986816, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.0641598701477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.6276807785034, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.7433590888977, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6271982192993, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.0913624763489, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.9884777069092, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8231983184814, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.1590437889099, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.7718362808228, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.0752058029175, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.4724817276001, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.4711985588074, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
515.7331275939941, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.2932796478271, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.2630400657654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.1612801551819, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.3769550323486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.9544053077698, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.6295971870422, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.9455981254578, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2756862640381, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.4188771247864, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.3984022140503, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.3758397102356, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.0684819221497, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.5692868232727, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.4145693778992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.6209635734558, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.2206382751465, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.5919995307922, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.1087942123413, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.7662410736084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.675359249115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.3584036827087, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.3766374588013, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.719202041626, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.6025557518005, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.5700764656067, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.8479995727539, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.3670401573181, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.6673564910889, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.5814433097839, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.0843200683594, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 942.0443248748779, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 948.3080005645752, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.94225025177, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 830.9920024871826, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.7846536636353, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.841588973999, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.9993591308594, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
828.1028842926025, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.4806480407715, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 955.6609582901001, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 824.8297595977783, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 834.2294263839722, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 938.6097621917725, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.8777656555176, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.3409585952759, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.2539157867432, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 943.7436819076538, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 953.3852815628052, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.9041595458984, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 834.291672706604, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 938.3051252365112, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 935.8153629302979, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 820.9539127349854, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 839.5467138290405, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 945.5777549743652, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 958.6444711685181, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 828.9371252059937, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 836.2817716598511, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 940.1252841949463, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.4692897796631, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 820.5091238021851, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 827.9468774795532, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1188.7638425827026, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1139.0116786956787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1192.6312065124512, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1144.9139213562012, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1196.6630458831787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1148.8268852233887, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1199.1324853897095, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1151.2102365493774, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.3561611175537, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.2972812652588, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.8870348930359, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.4310455322266, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.9700808525085, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 581.2256026268005, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.1779141426086, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.4120001792908, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.2089548110962, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.3401656150818, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.3679986000061, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.8452777862549, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.056960105896, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 663.0862379074097, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.4846396446228, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.0979180335999, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.2030415534973, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.6873650550842, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.217921257019, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.5999975204468, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.6790399551392, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.2347226142883, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.4379177093506, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.4911932945251, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 893.0113649368286, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.9203195571899, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 897.3352003097534, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.5142402648926, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 903.1423902511597, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.7889585494995, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 904.4144010543823, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 769.8015999794006, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 779.8886394500732, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.925440788269, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.5025572776794, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.8607997894287, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 790.2179217338562, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.1444764137268, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.8600015640259, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.758243560791, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 794.1782355308533, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.2632012367249, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.0107216835022, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
565.5444741249084, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 796.6763210296631, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.3857679367065, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 716.9395160675049, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.1494383811951, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2539.4411087036133, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.7367973327637, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2552.7148723602295, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.6376004219055, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2568.1479930877686, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.1838355064392, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2573.318395614624, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.6927938461304, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1952.103033065796, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1359.0283298492432, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1958.6380863189697, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1372.5860738754272, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1959.9724960327148, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1364.3467140197754, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1967.141752243042, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1370.417766571045, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1964.930076599121, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1362.9128122329712, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1967.912302017212, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1379.5636749267578, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1965.8495903015137, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1365.1654386520386, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1970.8820724487305, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1378.2129621505737, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1868.5980892181396, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1193.9454507827759, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1866.2722873687744, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1197.3190450668335, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1872.7472019195557, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1199.8555278778076, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1877.0246410369873, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1198.0375957489014, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7500.65071105957, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 968.5006332397461, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7496.341361999512, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 979.2843151092529, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7503.418045043945, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 981.8358421325684, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7526.106185913086, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 985.64528465271, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1259.6371126174927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1229.0788793563843, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1267.1767902374268, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1338.947515487671, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1885.2336120605469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1829.1235160827637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1836.0921669006348, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1887.8436851501465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1254.809913635254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1235.1792001724243, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1267.8843212127686, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1330.5446434020996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1872.5360107421875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1859.3099308013916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1874.436321258545, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1921.0147285461426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1252.7068710327148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1234.2902421951294, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1268.9958429336548, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1333.0531215667725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1888.139820098877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1859.1505718231201, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1876.254072189331, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1926.902084350586, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1247.8775882720947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1232.8001594543457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1263.5199975967407, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1334.09423828125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1880.45503616333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1844.8167896270752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1862.0091438293457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1917.6316738128662, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1258.6678314208984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1342.611198425293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1482.4416017532349, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1720.6641674041748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1856.96928024292, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1815.0046348571777, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1933.269920349121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2183.1913566589355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1266.679196357727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1325.452470779419, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1475.4838466644287, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1707.745590209961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1844.832468032837, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1815.1012802124023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1931.3668727874756, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2174.868803024292, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1266.0057735443115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1327.5030374526978, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1480.5196857452393, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1712.7619075775146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1853.2225608825684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1816.4598369598389, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1924.2225646972656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2175.711679458618, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1263.549599647522, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1326.150245666504, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1473.4009742736816, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1709.3239879608154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1847.5390338897705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1811.7511940002441, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1914.3512153625488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2152.9620838165283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1762.9961681365967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1973.5915184020996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2806.9182205200195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2885.530414581299, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2226.8295860290527, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
2266.8059253692627, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3211.6020488739014, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3295.11137008667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1795.231056213379, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1960.2460670471191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2831.8447971343994, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2919.1649436950684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2226.222267150879, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2253.9065551757812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3214.8178005218506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3310.1345825195312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1809.108648300171, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1947.1249389648438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2841.1847972869873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2937.2872066497803, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2229.363498687744, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2236.3116931915283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3219.3096256256104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3315.979804992676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1795.524492263794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1951.464958190918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2837.598237991333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2925.0332736968994, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2229.7761631011963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2237.8563117980957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3218.9145374298096, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3316.699962615967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1049.2107200622559, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1071.2718439102173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1108.029751777649, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1229.4878387451172, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1170.725440979004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1114.7571182250977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1149.4355154037476, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1250.3031921386719, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1021.492476463318, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1028.8374376296997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1086.8889474868774, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1192.2465658187866, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1185.2846431732178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1101.3500928878784, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1133.2726430892944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1232.7007913589478, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1021.5696096420288, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1029.5265626907349, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1080.1041507720947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1189.688959121704, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1181.3865518569946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1097.504644393921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1119.9983930587769, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1234.8043298721313, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1015.2828788757324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1029.4039916992188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1082.4987173080444, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1186.7692804336548, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1171.7518424987793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1090.1497650146484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1125.0076913833618, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1230.3971242904663, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1098.7814378738403, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1290.2457666397095, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1674.1484928131104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1704.6199893951416, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1201.496000289917, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1266.765432357788, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1693.1745529174805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1749.5990371704102, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1083.1881666183472, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1251.7452764511108, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1550.8132791519165, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1621.9798517227173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.3945455551147, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1232.2880029678345, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1679.4268608093262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1753.7819194793701, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1083.342866897583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1248.0281591415405, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1551.6948699951172, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1635.6678581237793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1207.8156757354736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1223.6540842056274, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1672.1687984466553, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1775.3828811645508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1082.9936027526855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1243.1292724609375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1542.17520236969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1629.67679977417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1205.3932809829712, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1218.94287109375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1667.2649574279785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1758.2412910461426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1728.3876705169678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2251.2641620635986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1474.0286493301392, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1972.3675155639648, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1675.532627105713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2198.624143600464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1452.8283262252808, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1999.5867156982422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1690.782871246338, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2193.932647705078, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1441.0691213607788, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.0332870483398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1685.5443286895752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2194.9545574188232, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1442.394404411316, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1977.8499126434326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 970.0470399856567, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 997.0609664916992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1072.8740692138672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1248.5355234146118, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1031.5075254440308, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 984.0806341171265, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1018.7641525268554, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1198.475193977356, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 950.4982328414917, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 979.9567937850952, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1019.9463891983031, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1141.3403129577637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1027.8775930404663, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 981.6134405136108, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1007.3108768463135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1205.0265645980835, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 951.2828826904297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 980.0927972793579, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.1099262237549, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1140.5527973175049, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1023.2049512863159, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 979.2481517791748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1009.7663974761963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1197.27135181427, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 941.0833597183228, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 970.2332782745361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1016.3004875183105, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1138.6312007904053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1021.0259103775023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 974.6134424209595, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1001.2521648406982, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1199.042239189148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1044.4852781295776, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1448.9542436599731, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1472.3787260055542, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1119.3969535827637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1284.0489721298218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1319.4139194488525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1030.494885444641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1308.4395265579224, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1311.0385608673096, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1094.4731187820435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1207.2350454330444, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1228.1020879745483, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1037.1731233596802, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1300.8471965789795, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1303.5825490951538, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1095.0407981872559, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1202.0291137695312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1237.1227169036865, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1028.5054445266724, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1298.2577562332153, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1289.831042289734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1085.804796218872, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1204.1727924346924, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1224.199833869934, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2830.60001373291, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1908.4585571289062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 
512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2821.8806552886963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1704.8300552368164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2823.439989089966, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1700.1129627227783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2822.421417236328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1693.81760597229, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1055.8260822296143, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1122.2932720184326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.8988733291626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 994.9108743667603, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.9841585159302, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1003.8108825683594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1029.577922821045, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 
256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1030.7419157028198, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1022.4097585678101, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 959.6831941604614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 966.933913230896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.3617658615112, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.6419191360474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1050.8609580993652, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1015.1004791259766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.5182361602783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 977.7567958831787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 955.3683137893677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1025.8622407913208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 1032.6502466201782, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1003.0964756011962, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 964.716968536377, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 967.7892780303955, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 955.720009803772, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1538.6998558044434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1461.3667249679565, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1401.9203281402588, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1329.8638439178467, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1414.4491243362427, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1319.8528003692627, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1392.3977613449097, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1304.7816038131714, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} 
+{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 944.4953536987305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 921.1899089813232, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 959.8500728607178, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1027.9425525665283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1248.3094453811646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1144.661283493042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1161.59423828125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1230.2307224273682, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.8316783905029, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 934.6414566040039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 982.3657655715942, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1043.7963247299194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1224.8020887374878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1137.7555227279663, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1173.082389831543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1222.1680116653442, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 953.5150289535522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 942.4519872665405, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 981.3996696472168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1050.058879852295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1221.3108777999878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1138.8548803329468, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1172.755208015442, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1220.6654262542725, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.6423978805542, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 932.8123188018799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 983.2894420623779, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1055.212163925171, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1218.5444736480713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1145.8080053329468, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1168.4464025497437, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1225.6032037734985, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1073.4163188934326, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1026.2518405914307, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.492483139038, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1659.54110622406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1236.7595148086548, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.9014434814453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1295.489592552185, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1745.9214115142822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1094.9641704559326, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1032.546877861023, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1164.395203590393, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1676.9783973693848, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1230.4492855072021, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1154.3604850769043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1276.6979217529297, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1756.736183166504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1098.8934421539307, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1039.5990371704102, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1177.2708749771118, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1674.8657703399658, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1216.3171243667603, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1154.558401107788, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1281.4539241790771, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1770.5393695831299, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1098.1537628173828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1038.7574291229248, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.7703981399536, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1675.3443336486816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1240.3358364105225, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1163.2436847686768, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1287.349443435669, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1772.5182437896729, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1483.6267232894897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2058.093433380127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2138.203344345093, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1488.0023956298828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1934.7883033752441, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2011.0975837707522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1501.5256023406982, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2074.6416091918945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2159.3063926696777, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1523.474555015564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1944.3254375457764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2013.9351940155027, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1496.3699197769165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2065.676803588867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2159.945125579834, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1523.845772743225, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1939.9772930145264, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2017.0648097991943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1509.3516778945923, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2076.5209579467773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2157.4324703216553, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1532.1574449539185, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1928.3809661865234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2026.8329524993899, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.9905681610107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 782.8438425064087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 790.8652782440186, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 854.9108839035034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 912.6281642913818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.8228783607483, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 870.9828805923462, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 918.4670352935791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 843.9142417907715, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 785.466878414154, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 791.5721607208252, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 854.8796939849854, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 908.6244821548462, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
840.716962814331, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 859.8980808258057, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.2230348587036, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 843.012638092041, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.0084824562073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 800.4503989219666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.0716829299927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 910.9155225753784, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 830.6192016601562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 853.8899230957031, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.5508832931519, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 838.1491088867188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.7686381340027, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 788.2316780090332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 857.2512054443359, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 916.290397644043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.8497676849365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 861.4774417877197, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.3064050674438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1032.7127981185913, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 904.6132898330688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1161.0614252090454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1191.8912029266357, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1032.157917022705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 903.1121587753296, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1200.6313610076904, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1231.362886428833, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1025.4160022735596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 912.3614358901978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1153.566074371338, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1203.8488101959229, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1051.1552000045776, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 904.2190456390381, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.6700716018677, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1228.632001876831, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.8041677474976, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 906.7972755432129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1155.9976053237915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1208.2734441757202, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1050.1409721374512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 908.6545562744141, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.9870443344116, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1229.4998359680176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.619517326355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 907.6220798492432, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.5985660552979, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1203.9919996261597, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1051.995997428894, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 899.6364688873291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1172.9750490188599, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1224.14559841156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1271.17600440979, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1871.4967823028564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1560.0763273239136, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1430.594882965088, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1283.8611221313477, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1842.8488159179688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1568.6601543426514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1429.5713520050049, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1279.6668720245361, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1839.9696063995361, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1567.2684717178345, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1429.3268775939941, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1280.3593587875366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1837.193603515625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1569.7999954223633, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1424.4983959197998, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 766.4009618759155, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 737.6667165756226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 884.2187213897705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 877.4310350418091, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.7758502960205, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 748.5017585754395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 858.6201667785645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 867.6632070541382, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.961916923523, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
743.5412788391113, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 884.1987133026123, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 871.1969566345215, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 884.7262334823608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.1065592765808, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 873.3092784881592, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 867.4163246154785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.946400642395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.021915435791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 888.6913728713989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 874.2224073410034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 885.1534366607666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.3604822158813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 877.9908752441406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 866.2334442138672, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.2582406997681, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.397442817688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.8137636184692, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.7820701599121, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 881.8862390518188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 749.9992036819458, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 868.6065578460693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 864.4028854370117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 920.4859161376953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1009.4046401977539, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 974.5896053314209, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 964.6912050247192, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 895.863676071167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 994.3841505050659, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 989.8822498321533, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.9955263137817, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 900.0070285797119, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 996.6476678848267, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 991.2620830535889, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 953.3940839767456, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 897.6777648925781, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1000.6913566589355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 990.504322052002, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 954.9004793167114, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2263.038558959961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1323.576636314392, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2301.4603233337402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1264.331521987915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2302.694854736328, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1267.673602104187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2307.870569229126, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1258.8591957092285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 787.6683187484741, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 817.1161603927612, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 821.6371250152588, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 981.700005531311, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 736.1006379127502, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.3566384315491, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.0086398124695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 817.7670288085938, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.150707244873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1032.6063966751099, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 761.4759993553162, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.3863978385925, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.251359462738, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 818.1078481674194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.7395105361938, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1034.12832736969, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 761.8644785881042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.936324596405, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 788.0902433395386, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 815.7070446014404, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 820.5142450332642, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1038.333592414856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 759.4526362419128, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 761.155993938446, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1339.7724771499634, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 959.555835723877, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1346.8232107162476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 946.7956829071045, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1346.158561706543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 943.0287933349609, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1353.8382387161255, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 941.4694404602051, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 845.9726428985596, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 775.5967998504639, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 841.9164848327637, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1079.2651176452637, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.840163230896, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 870.3110456466675, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.1449527740479, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1197.0763111114502, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.4366397857666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.570240020752, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 856.6934299468994, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1082.4855852127075, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.4817638397217, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 880.2841663360596, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 948.8760042190552, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1192.2329568862915, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 870.0771236419678, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 813.8033580780029, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 858.876805305481, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1093.7046384811401, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 942.4694442749023, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 882.0011186599731, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 945.2363300323486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1203.0700874328613, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 869.0003156661987, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.8844747543335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 861.8700838088989, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1091.2603187561035, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.47057056427, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 878.6203193664551, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 944.4849634170532, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1203.243522644043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1033.3436679840088, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1207.5406312942505, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1173.7708759307861, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1192.347526550293, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1320.9932851791382, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1330.1750421524048, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1050.3980731964111, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1230.4545593261719, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1167.5486421585083, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1196.5935945510864, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1326.1436891555786, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1356.0465621948242, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1045.1977586746216, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1235.4233598709106, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1178.8600063323975, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1208.9457607269287, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1353.184962272644, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1351.0628843307495, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1053.8259315490723, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1235.1115226745605, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1184.0790367126465, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1202.9614400863647, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1341.5307140350342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1351.109766960144, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1668.6043167114258, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1701.7046356201172, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1702.029619216919, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1759.364309310913, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1714.9470329284668, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, 
"num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1757.2215843200684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1700.3249549865723, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1744.2187118530273, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.0782399177551, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.1671957969666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 817.7572774887085, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 794.7724795341492, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.5425634384155, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.604642868042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 896.964168548584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 887.2455978393555, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.5985608100891, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.5459160804749, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.7412910461426, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 792.7508807182312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.1131205558777, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.0655989646912, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 887.9617643356323, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 883.8465738296509, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.147358417511, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 682.3033547401428, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 802.4652886390686, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 786.8052816390991, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.2721586227417, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 701.9630408287048, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 890.749921798706, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 874.4563150405884, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.8739199638367, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 682.4007964134216, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.9575939178467, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 788.403844833374, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.8159928321838, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.9315156936646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 879.440803527832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 882.3428821563721, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1108.8305568695068, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.3284816741943, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1241.6048002243042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 961.4681720733643, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1107.2003173828125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 881.5844631195068, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1247.4438428878784, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 955.5788803100586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1107.2967958450317, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 875.1123237609863, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1258.9489603042603, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.6195116043091, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1114.2772769927979, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 875.8384037017822, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1256.9084692001343, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.7646503448486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.7620806694031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.1451177597046, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.1793661117554, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.5108728408813, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 689.3284797668457, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.8601622581482, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 650.4443168640137, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.124801158905, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.8396754264832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 650.0739216804504, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.7012825012207, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.0360040664673, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 647.6044821739197, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.7734422683716, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.2761583328247, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.8214454650879, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.6916809082031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.6492805480957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.2700819969177, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.9756808280945, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.0694398880005, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.4041619300842, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.0518345832825, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 668.7038397789001, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 864.3414306640625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 843.0956792831421, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 876.8566417694092, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 850.1678419113159, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.2503900527954, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 848.4531307220459, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 885.897912979126, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 861.7019271850586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 847.0667171478271, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 706.1308836936951, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.3103971481323, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.1363224983215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 838.4364652633667, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 716.8809580802917, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.9019193649292, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 732.8057599067688, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 989.3824005126953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1066.811842918396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 906.6174411773682, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 899.2847967147827, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 974.1414451599121, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1033.1201601028442, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 884.8302412033081, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 883.7942409515381, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 989.6883153915405, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1063.9334392547607, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 912.336163520813, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 893.0531215667725, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 980.0006341934204, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1033.5857677459717, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 890.5939102172852, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 878.7131214141846, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 996.466236114502, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1067.798252105713, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 921.65696144104, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 902.8363132476807, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 982.524471282959, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1036.8795156478882, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 897.1425676345825, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 882.0582437515259, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1003.8440036773682, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1072.005763053894, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.3512048721313, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 904.8819208145142, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 988.6751985549927, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1035.290560722351, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 903.3412790298462, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 886.4716863632202, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1329.116153717041, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1283.4150457382202, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1339.9849557876587, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1289.2715072631836, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1342.231035232544, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1291.8219137191772, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1349.4299173355103, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1297.330241203308, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.3017587661743, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.5856022834778, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 682.9895949363708, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.56720495224, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.6203184127808, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.1225638389587, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.2948794364929, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.2297639846802, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
691.5073585510254, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 723.5872006416321, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.2993545532227, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.3057541847229, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.454562664032, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 749.0086364746094, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.6479978561401, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.7281637191772, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 723.5769605636597, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.1873574256897, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.3672018051147, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.7321577072144, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.7396841049194, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 735.5985593795776, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.3025612831116, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.0387234687805, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 969.3411254882812, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 859.346079826355, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 961.6574430465698, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 853.6302471160889, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 967.3595142364502, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 857.2313642501831, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 988.6204767227173, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 868.9715194702148, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 850.5121660232544, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 697.481279373169, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 774.8774480819702, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.3443264961243, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 858.4241580963135, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.9556841850281, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 781.6708827018738, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.3571176528931, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.6891202926636, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.0371189117432, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 784.1585612297058, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.6140828132629, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 874.3204832077026, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 721.0715198516846, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 796.3966393470764, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.4879965782166, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2717.9745769500732, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.3230409622192, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2699.128303527832, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.7743978500366, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2709.370880126953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.9462385177612, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2731.365451812744, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.8323197364807, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2076.4281463623047, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1457.1595239639282, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2097.941131591797, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1478.0262327194214, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2083.8921642303467, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1455.8040046691895, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2107.654552459717, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1482.8478288650513, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2091.898708343506, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1463.9102411270142, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2113.7078285217285, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1491.2033605575562, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2095.1006412506104, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1478.0684804916382, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2118.473119735718, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1499.1064023971558, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1905.901107788086, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1274.986081123352, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1906.136178970337, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1281.6697692871094, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1920.943193435669, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1285.9129619598389, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1945.6445026397705, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1290.278401374817, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7693.116188049316, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1036.5203142166138, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7675.943145751953, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1045.0107145309448, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 7711.6047286987305, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1043.6899137496948, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7772.8369140625, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1056.410574913025, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3398.1664276123047, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3375.6281661987305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3467.9852867126465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3659.65389251709, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4979.245738983154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4937.603549957275, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4985.6013107299805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5119.582862854004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3359.736328125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3379.3026161193848, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3461.6123008728027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3615.9483337402344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5002.736167907715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5040.7399559021, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5092.132034301758, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5243.2403564453125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3316.3067054748535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3369.159393310547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3449.480667114258, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3598.1529426574707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5027.1360206604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5043.435821533203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5120.320644378662, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5268.351173400879, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3298.319206237793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3358.3537673950195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3430.9513664245605, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3578.1862449645996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5025.854225158691, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5059.779815673828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5117.811489105225, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5271.095542907715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3483.228645324707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3669.793472290039, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 4052.63614654541, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4730.128307342529, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4969.440116882324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4904.541282653809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5213.0596923828125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5982.516326904297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3422.6590156555176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3600.3073501586914, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3924.7011375427246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4625.71403503418, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4952.659015655518, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4925.298900604248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5185.362358093262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 
1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5933.870868682861, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3395.7278442382812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3580.5932998657227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3956.465129852295, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4595.521926879883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4958.760833740234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4938.946552276611, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5199.3218994140625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5954.500961303711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3378.9064407348633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3558.86287689209, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3929.863815307617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4564.4440269470215, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4956.67293548584, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4910.14892578125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5218.565444946289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5984.881420135498, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4829.424667358398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5433.060321807861, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7806.777114868164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8017.152519226075, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6049.528961181641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6209.582901000977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8967.93815612793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9176.490364074707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 4776.638412475586, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5285.107326507568, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7859.498329162598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8068.514213562011, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6004.361743927002, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6197.78959274292, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8971.304893493652, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9194.435691833496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4811.991539001465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5260.808944702148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7882.068824768066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8095.280799865723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6053.629627227783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 
2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6205.10046005249, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8991.76399230957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9224.360389709473, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4807.774562835693, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5241.690616607666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7886.771087646484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8113.81248474121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6043.034381866455, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6202.785606384277, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9000.541343688965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9221.157264709473, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2861.185464859009, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2899.6649742126465, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3039.9782371520996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3371.622085571289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3189.772663116455, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3074.2504024505615, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3152.0294284820557, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3387.3132133483887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2674.692335128784, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2741.5833473205566, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2873.1006240844727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3182.318878173828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3165.7116985321045, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2923.6572647094727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 2999.93070602417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3241.220169067383, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2631.664161682129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2701.073589324951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2865.827522277832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3164.26176071167, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3143.0513763427734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2881.092004776001, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2998.9297771453857, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3238.419075012207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2610.135660171509, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2679.1004943847656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2848.853931427002, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} 
+{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3141.5671825408936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3113.8391971588135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2874.32222366333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2992.4087810516357, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3231.563186645508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2998.3241748809814, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3538.88126373291, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4541.369152069092, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4591.915645599365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3285.662250518799, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3416.860828399658, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4514.179973602295, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4756.4812660217285, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2927.3929595947266, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3329.757614135742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4065.9684944152837, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4292.513599395752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3211.86767578125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3192.1806144714355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4474.955062866211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4675.568618774414, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2908.5465717315674, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3317.319164276123, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4074.192638397217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4305.4304122924805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3193.121757507324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3178.0118560791016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4489.347724914551, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4689.27282333374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2889.308786392212, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3298.177604675293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4065.5955123901367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4299.972667694092, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3173.1617736816406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3168.003349304199, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4484.960670471191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4688.771991729736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4707.722549438477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6129.013919830322, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3971.7742919921875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5325.346088409424, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4491.353759765625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6070.331707000732, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3809.7813034057617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5318.385791778564, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4495.207462310791, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6050.131034851074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3848.9816665649414, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5319.352264404297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4480.509128570557, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
6053.228282928467, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3830.306911468506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5331.415042877197, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2645.3375911712646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2740.9164905548096, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2942.057590484619, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3396.282215118408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2791.8868732452393, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2684.8777770996094, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2787.807502746582, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3316.223030090332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2545.5401611328125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2587.7246475219727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2703.953561782837, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2969.3715286254883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2682.7446365356445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2565.7172775268555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2704.6086502075195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3180.517120361328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2505.807695388794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2560.28302192688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2683.8115215301514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2972.8806495666504, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2659.173765182495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2542.5372886657715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2683.1230449676514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3173.1118488311768, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2492.860326766968, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2540.917615890503, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2672.7654552459717, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2966.6897583007812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2651.095190048218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2526.7064094543457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2670.849094390869, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3172.3654556274414, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2822.297592163086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4021.46240234375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4006.723804473877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 2985.1467418670654, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3496.418914794922, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3583.0467224121094, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2690.296154022217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3275.2033615112305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3365.4108810424805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2827.7331161499023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3144.350709915161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3245.333938598633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2664.6996688842773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3274.598560333252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3357.90225982666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2823.8379096984863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, 
"num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3141.2028789520264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3239.948310852051, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2656.415672302246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3273.0588912963867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3344.326515197754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2819.5796871185303, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3142.66752243042, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3245.0380897521973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7468.764305114746, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5136.006107330322, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7173.5515213012695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4534.069080352783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
7207.119598388672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4524.626083374023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7209.914016723633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4528.137454986572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2775.597610473633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2904.4750022888184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3118.497905731201, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2689.8905754089355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2620.643539428711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2736.3576126098633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2605.3681564331055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2575.7755088806152, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2597.467851638794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2484.5435333251953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2382.88911819458, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2440.0817489624023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2588.324022293091, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2577.5216102600098, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2589.388647079468, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2428.8852882385254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2383.675193786621, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2429.655990600586, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2588.593759536743, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2573.782091140747, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2580.9030532836914, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2406.85152053833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 
256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2379.979200363159, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2426.647367477417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3961.3167762756348, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3724.070415496826, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3542.637462615967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3252.7791786193848, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3569.930839538574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3279.5262145996094, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3582.5556564331055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3281.7317962646484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2374.9667358398438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2363.015537261963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 2469.046573638916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2636.2950801849365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2933.1019020080566, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2786.2177658081055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2857.806558609009, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2981.432809829712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2458.396472930908, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2414.6208000183105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2538.7096214294434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2695.8969402313232, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2894.0816020965576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2809.7070503234863, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2849.927349090576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} 
+{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2975.6071949005127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2468.083028793335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2424.6228790283203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2539.232635498047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2698.7817764282227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2902.682867050171, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2804.1208171844482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2862.282419204712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2978.3506965637207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2473.724822998047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2419.9643230438232, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2534.3817615509033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2698.1713676452637, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2899.4129753112793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2807.589912414551, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2855.5313682556152, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2981.5920066833496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2663.2795238494873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2625.1976203918457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2965.526885986328, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4074.348964691162, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2983.815870285034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2827.1772956848145, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3149.468011856079, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4402.744312286377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 2756.571521759033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2653.8516807556152, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2971.174077987671, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4121.101474761963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3044.738073348999, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2848.9119720458984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3129.304962158203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4454.999847412109, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2760.1497554779053, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2649.9431800842285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2973.3580684661865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4139.254055023193, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3044.8459148406982, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, 
"num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2856.2953662872314, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3123.3929443359375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4458.900909423828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2767.0831966400146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2652.226400375366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2967.517442703247, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4114.990711212158, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3038.4968090057373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2857.829761505127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3135.0531005859375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4468.111057281494, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3609.7054481506348, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5193.585109710693, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5422.665119171143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3643.6180877685547, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4903.718891143799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5064.913959503174, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3677.795524597168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5217.720584869385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5441.573429107666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3727.468032836914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4881.699199676514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5077.261753082275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3680.0318336486816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5218.181610107422, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5454.44128036499, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3750.2574729919434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4878.147830963135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5091.23104095459, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3680.569267272949, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5216.019382476807, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5455.9124755859375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3753.722038269043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4876.273937225342, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5101.654090881348, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2073.2859230041504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1901.355218887329, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1954.589605331421, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2138.2076930999756, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2206.1382484436035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2030.9395313262942, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2087.3702430725098, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2266.5113735198975, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2087.238712310791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1904.84769821167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1958.400478363037, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2128.4636878967285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2216.427354812622, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2033.6316871643066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2086.618871688843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
2273.0257987976074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2082.1815967559814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1904.2384147644043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1954.8950290679932, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2130.430564880371, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2203.6538982391357, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2035.9676837921143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2082.168016433716, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2284.886884689331, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2079.059371948242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1902.9017639160156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1953.128957748413, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2125.4708862304688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2205.1587295532227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2034.4441604614256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2079.591999053955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2274.364004135132, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2535.150566101074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2259.431505203247, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2845.5051136016846, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2914.4007873535156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2463.1679821014404, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2234.590082168579, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2912.757921218872, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3021.9435024261475, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2569.222402572632, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2258.2121562957764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2799.4305419921875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2899.363832473755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2474.388484954834, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2201.64927482605, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2873.486557006836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2988.3344078063965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2560.309133529663, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2258.278570175171, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2794.201774597168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2906.28849029541, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2477.816162109375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 2197.915687561035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2874.9288177490234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2992.1313762664795, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2554.886713027954, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2246.7395210266113, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2799.214868545532, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2908.0331134796143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2491.225748062134, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2195.72735786438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2878.4177589416504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2991.6307163238525, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3188.3969688415527, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4585.373268127441, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 
1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3781.713581085205, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3472.1510696411133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3110.698719024658, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4583.545951843262, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3819.6694374084473, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3483.972969055176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3115.3204822540283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4617.675189971924, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3830.5409622192383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3495.5118560791016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3122.009925842285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4619.451847076416, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3831.1009216308594, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3493.672504425049, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1774.6553707122803, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1732.4222660064697, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2023.433427810669, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2105.2148723602295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1926.328945159912, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1747.3659229278564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2028.6406517028809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2068.9443016052246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1777.6380729675293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1697.9140949249268, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2001.6564655303957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2006.9626998901367, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.6582489013672, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1727.123498916626, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2048.303689956665, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2064.1391944885254, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1765.5036926269531, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1697.0948791503906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1996.7046356201172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2014.450874328613, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1986.7118167877197, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1724.1163158416748, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2043.101444244385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2070.168466567993, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1767.9752159118652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1690.178565979004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2001.7927932739258, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2017.6481437683105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1986.7012786865234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1716.0296058654785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2056.33056640625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2071.194849014282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2280.185432434082, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2381.10463142395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2327.71183013916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2345.991849899292, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
2227.5305461883545, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2355.4878520965576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2318.7060832977295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2318.8564682006836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2210.6049728393555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2354.4163131713867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2321.523332595825, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2322.78431892395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2207.3196983337402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2352.5390243530273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2321.721782684326, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2322.556962966919, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5269.579048156738, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 
1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3226.089630126953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5280.382270812988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2994.468011856079, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5306.164970397949, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2998.841257095337, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5340.772190093994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3000.4952144622803, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1736.9003009796143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1817.867841720581, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1844.2942428588867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2204.0719985961914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1571.3019180297852, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1676.655511856079, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1669.9276638031006, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1778.340015411377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1801.4459037780762, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2299.9059009552, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1633.461594581604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1641.5945720672607, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1657.5297832489014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1788.0214500427246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1803.341121673584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2307.8664016723633, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1628.8083171844482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1653.2007884979248, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1656.5063953399658, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1783.9534282684326, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1796.2208080291748, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2300.9156608581543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1624.0582466125488, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1649.1795063018799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3047.9569721221924, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2316.324167251587, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2988.1812858581543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2133.7673664093018, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2990.3555488586426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2142.683343887329, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3006.4640140533447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2144.823989868164, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1968.0879878997803, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1843.1980800628662, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1965.6065464019775, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2297.1195220947266, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2031.2476921081543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1970.7147216796875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2077.2993659973145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2520.996160507202, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2059.16880607605, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1986.493787765503, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2088.6003398895264, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
2350.47438621521, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2114.6878242492676, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2054.633913040161, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2168.411512374878, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2555.338888168335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2078.6731338500977, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1988.6934280395508, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2088.3039951324463, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2353.251190185547, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2109.0505695343018, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2056.8019104003906, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2167.594585418701, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2577.12495803833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2084.486885070801, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1988.7305450439453, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2086.6374492645264, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2370.1675128936768, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2110.013608932495, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2056.52174949646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2166.485776901245, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2594.452476501465, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2260.727834701538, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2539.962863922119, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2508.5123252868652, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2491.8990516662598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2810.4096031188965, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2869.3971157073975, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2418.5827255249023, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2629.3460750579834, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2641.023349761963, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2531.2494373321533, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2830.636339187622, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2872.23087310791, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2423.986883163452, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2645.1630306243896, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2639.1614151000977, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2532.3561477661133, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2830.993137359619, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 2881.0789012908936, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2424.3142414093018, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2679.1438388824463, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2639.4916915893555, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2547.5068759918213, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2849.4545555114746, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2885.873441696167, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3568.21439743042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3626.3113594055176, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3677.0099449157715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3907.2679710388184, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3716.8251419067383, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3905.219192504883, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 
1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3793.213596343994, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3907.347011566162, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1440.1521587371826, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1458.5476779937744, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1645.1446533203125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1683.3689403533936, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1552.6040029525757, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1485.0561618804932, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1873.8601684570312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1888.633918762207, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1471.478238105774, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1467.3521614074707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1641.1303901672363, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1640.5348873138428, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1581.9297647476196, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1483.4915208816528, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1885.3987216949463, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1892.7788829803467, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1466.513442993164, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1466.6603136062622, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1645.2782154083252, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1642.2620868682861, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1580.8089590072632, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1478.3070468902588, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1876.915683746338, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 1898.0444622039795, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1468.7383937835693, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1467.1684789657593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1654.6161556243896, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1647.7076816558838, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1579.432315826416, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1474.7764825820923, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1881.3409423828125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1901.9969367980957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2239.4564723968506, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1791.3531303405762, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2586.477117538452, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2057.7115058898926, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} 
+{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2309.9473571777344, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1789.504976272583, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2661.491184234619, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2058.5496044158936, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2323.161449432373, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1791.2844944000244, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2663.17120552063, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2063.3236694335938, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2339.2657375335693, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1804.5136070251465, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2669.210557937622, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2065.5083179473877, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1232.027039527893, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1318.1190490722656, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1317.3673486709595, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1269.2607975006104, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1309.6385526657104, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1291.9852876663208, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1257.6239919662476, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1320.0656032562256, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1274.7187089920044, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1288.316798210144, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1316.1670446395874, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1253.4862327575684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1251.3371229171753, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1314.6345663070679, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1285.5884838104248, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1283.6652755737305, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1318.3768033981323, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1259.4539213180542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1253.532633781433, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1326.0905504226685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1289.2579221725464, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1284.3487977981567, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1318.4723281860352, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1264.8073625564575, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1751.1100769042969, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1707.0840072631836, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1777.3012828826904, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1719.8964881896973, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1778.2929801940918, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1721.4891147613525, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1786.482572555542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1726.038408279419, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1579.8164749145508, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1382.9824018478394, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1618.8468837738037, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1412.4438428878784, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1625.344476699829, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1417.297601699829, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 1639.138422012329, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1414.4838380813599, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1911.3032245635986, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1967.8265571594238, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1752.2313499450684, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1803.6036777496338, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1897.3329639434814, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1926.4302253723145, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1710.7283210754395, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1766.4305400848389, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1995.8062553405762, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2069.9638271331787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1959.3830299377441, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, 
"num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1979.8966217041016, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1975.342903137207, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2003.9855957031252, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1900.9603214263916, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1920.981912612915, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1997.518720626831, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2086.5457725524902, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1958.3865642547607, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1982.981767654419, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.9335842132568, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2009.5001602172854, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1893.1991863250732, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1918.327522277832, 
"config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2006.7752265930174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2093.8532733917236, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1958.1025695800781, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1985.1593494415283, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1992.9534530639648, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2025.5969619750974, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1896.0628700256348, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1918.2876777648926, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2485.310583114624, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2392.2119998931885, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2708.517904281616, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2613.7056064605713, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2739.5281505584717, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2627.535991668701, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2749.733934402466, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2627.3576068878174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1312.4516868591309, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1371.1948776245117, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1270.139832496643, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1263.5889625549316, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1305.927677154541, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.359040260315, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1343.798713684082, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1385.2571296691895, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1310.3020858764648, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1291.3950490951538, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1327.0172834396362, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1247.3668766021729, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1345.3529596328735, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1385.2844858169556, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1319.5417547225952, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1294.631519317627, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1328.6287927627563, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1260.3399991989136, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1364.982409477234, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1405.8966398239136, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1327.6868772506714, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 1312.0347356796265, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1341.5497636795044, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1269.441270828247, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1848.6526489257812, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1629.4103908538818, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1871.0152053833008, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1676.5619087219238, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1874.618215560913, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1676.3841533660889, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1894.251365661621, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1689.7099018096924, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1476.5927982330322, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1110.1559972763062, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 
4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1330.5219173431396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1004.140796661377, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1501.026725769043, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1125.6663942337036, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1354.266881942749, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1019.5953702926637, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1508.3126401901245, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1133.9270496368408, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1352.785120010376, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1026.3347148895264, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1534.6483182907104, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1166.9036865234375, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1380.561923980713, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1042.5006341934204, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5522.248821258545, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1113.4160041809082, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5470.490398406982, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1125.6692743301392, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5484.120445251465, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1136.4676904678345, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5520.324821472168, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1159.1534423828125, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2378.0892753601074, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1687.1630477905273, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2392.27313041687, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1698.1086158752441, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2512.5012588500977, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1811.3804912567139, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2538.1628704071045, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1804.7070598602295, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2522.376136779785, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1819.1190338134766, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2555.8112239837646, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1805.8307266235352, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2550.4505825042725, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1844.9092769622803, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2573.529920578003, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1825.0847816467285, "config": {"BLOCK_SIZE_M": 256, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2023.0862617492678, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1507.276315689087, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2053.344955444336, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1537.781286239624, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2122.2268676757812, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1551.8648052215576, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2237.440004348755, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1632.534580230713, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8713.604316711426, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1220.901608467102, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8574.843406677246, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1244.5124912261963, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8596.102027893066, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1268.5089540481567, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8652.186546325684, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1299.2396926879883, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6719.045829772949, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6646.787338256836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6822.736511230469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7198.8043212890625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9677.865180969238, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9703.671226501465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9818.126602172852, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10076.600303649902, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6609.2461013793945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, 
"num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6655.421257019043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6807.887229919434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7085.511932373047, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9732.455673217773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9927.391128540039, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10030.67813873291, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10281.354789733887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6485.452919006348, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6611.485710144043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6747.314453125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6970.997009277344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9748.673858642578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9867.727699279785, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9958.968276977539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10270.403823852539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6382.735500335693, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6559.672164916992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6699.045562744141, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6900.298919677734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9748.525924682617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9864.921913146973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9995.019302368164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10280.852813720703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6825.440444946289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7223.251495361328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7947.279357910156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9303.040809631348, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9681.023712158203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9596.994743347168, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10191.981315612793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11701.329498291016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6696.771507263184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7077.397804260254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7618.75057220459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9020.933380126953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9724.002075195312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9601.309242248535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10163.078880310059, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 
8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11606.125602722168, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6585.039100646973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6958.767623901367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7512.055206298828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8863.439254760742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9687.592086791992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9602.247543334961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10194.06509399414, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11706.768798828125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6529.177284240723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6899.077339172363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7474.810562133789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8827.585792541504, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9687.701606750488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9602.64144897461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10169.466819763184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11735.445899963379, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9477.139663696289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10695.657348632812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15290.210266113281, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15674.352188110352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11773.2661819458, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12233.904457092285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17582.018432617188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17966.37222290039, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9243.216171264648, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10297.956352233887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15336.405029296875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15736.24641418457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11684.675407409668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12215.528450012207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17559.83039855957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17977.873992919922, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9164.625511169434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10182.287864685059, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15369.207153320312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15773.988800048828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11736.568374633789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12148.052673339844, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17595.061569213867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 18015.470428466797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9149.334564208984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10125.5904006958, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15400.078659057617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15822.32666015625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11784.226951599121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12156.501388549805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17620.61851501465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 18044.183044433594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5675.832767486572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 5698.177127838135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5975.10383605957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6618.336334228516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6171.899662017822, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6066.244468688965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6220.200786590576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6645.095367431641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5273.587017059326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5402.191505432129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5646.9794845581055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6251.028804779053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6061.439208984375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5777.230854034424, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5890.69356918335, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6332.887668609619, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5129.495010375977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5286.825923919678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5573.205261230469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6154.095039367676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5995.698890686035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5721.798915863037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5908.659362792969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6315.314407348633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5041.260147094727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5259.460315704346, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5519.79362487793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6125.026073455811, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5976.546192169189, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5714.399166107178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5892.145481109619, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6302.975978851318, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5894.132957458496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6954.125137329102, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8886.41887664795, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8952.797317504883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6392.960987091064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6671.162910461426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8813.133926391602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 9337.667121887207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5698.531799316406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6447.576484680176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7924.637184143066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8369.136543273926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6173.743152618408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6264.206714630127, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8743.07674407959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9117.575721740723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5611.920680999756, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6356.739044189453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7934.805641174316, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8377.519340515137, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6116.063995361328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6204.092330932617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8743.186950683594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9119.889526367188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5566.061267852783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6338.58283996582, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7934.606513977051, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8394.662551879883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6071.737442016602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6201.879234313965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8753.512344360352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9142.232971191406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9188.326034545898, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12029.552192687988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7759.568176269531, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10425.077896118164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8711.600723266602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11884.332962036133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7467.669868469238, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10386.559562683105, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8649.6439743042, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11911.956939697266, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7401.120491027832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10385.019073486328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8624.097480773926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 11920.186614990234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7429.005966186523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10401.572341918945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5242.619190216064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5362.594890594482, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5777.8839683532715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6670.206451416016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5444.042701721191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5267.865428924561, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5436.658191680908, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6524.570274353027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4925.76530456543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4999.675884246826, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} 
+{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5297.648010253906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5717.866916656494, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5186.768817901611, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5078.205165863037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5308.9606285095215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6162.939872741699, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4826.736145019531, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4905.82498550415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5236.624011993408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5716.376152038574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5110.968036651611, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5017.948169708252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5274.105796813965, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6174.01424407959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4792.508163452148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4855.534687042236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5203.804988861084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5710.014667510986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5096.951522827148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4953.240985870361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5224.959354400635, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6158.796844482422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5487.086582183838, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7942.040672302246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7822.897109985352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5868.934917449951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6912.585296630859, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6989.776382446289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5174.915199279785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6361.993618011475, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6542.569770812988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5491.790885925293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6112.320308685303, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6322.597770690918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5086.477298736572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6363.957786560059, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6526.952133178711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5421.695194244385, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6107.9155349731445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6322.765789031982, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5067.203693389893, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6360.346431732178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6526.633529663086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5434.59924697876, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6110.4375648498535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6325.835647583008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 14440.595092773438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10029.375038146973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13654.854125976562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8831.327857971191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 13490.739364624023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8784.872779846191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13635.31265258789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8793.098983764648, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5438.973579406738, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5876.558570861816, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6112.789287567139, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5267.197914123535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5274.315223693848, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5352.274875640869, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5036.7919921875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4881.283855438232, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4955.075969696045, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4842.444686889648, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4561.748313903809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4639.947700500488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4948.326072692871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4874.974060058594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4946.101150512695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4721.37565612793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4536.0846519470215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4649.232139587402, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4942.004203796387, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4873.052978515625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4942.622852325439, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4680.519638061523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 
256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4532.869606018066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4647.036476135254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7762.73006439209, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7243.764915466309, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6887.067565917969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6274.78572845459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6919.398880004883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6312.515525817871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6959.618301391602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6354.217758178711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4622.491874694824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4635.30553817749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 4815.625591278076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5102.1173095703125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5546.114749908447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5374.777774810791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5499.715843200684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5727.480506896973, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4762.509899139404, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4740.871715545654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4904.640007019043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5184.524936676025, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5479.5676612854, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5418.470726013184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5554.413585662842, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5727.69588470459, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4751.7206382751465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4738.061141967773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4903.37345123291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5179.840145111084, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5485.44620513916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5408.506565093994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5529.57950592041, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5700.348815917969, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4757.808666229248, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4751.684169769287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4917.278347015381, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5181.759376525879, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5499.914436340332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5461.9610023498535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5587.750091552734, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5740.932846069336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5121.468772888184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5075.416164398193, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5739.204483032227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7846.584854125977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5739.576950073242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5431.143550872803, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6058.545303344727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8488.752326965332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 5238.2073974609375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5097.761116027832, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5702.49870300293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7878.4770584106445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5780.096168518066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5491.426048278809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6018.958568572998, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8512.580184936523, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5250.671329498291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5082.024211883545, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5694.69690322876, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7919.532051086426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5784.579372406006, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 
3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5471.473426818848, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6036.342372894287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8564.574279785156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5288.1340408325195, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5090.164642333984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5668.336143493652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7917.037124633789, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5795.811672210693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5504.110870361328, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5994.868011474609, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8560.474281311035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6933.077812194824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9977.87338256836, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10447.89264678955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6998.233451843262, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9419.699249267578, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9716.440353393555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6991.3177490234375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10006.340827941895, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10453.75747680664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7009.168014526367, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9375.206527709961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9745.266418457031, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7065.209732055664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10026.421699523926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 10478.891372680664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7125.404891967773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9385.543937683105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9792.298126220703, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7063.996353149414, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10041.904258728027, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10494.761810302734, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7111.309547424316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9399.676361083984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9774.037818908691, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3976.5615844726562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3661.437587738037, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3802.838077545166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, 
"num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4128.471527099609, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4233.222236633301, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3965.367965698242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4101.369171142578, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4347.764015197754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3982.5880241394043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3647.1345710754395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3763.55411529541, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4108.3514976501465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4254.422721862793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3899.9710655212402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4067.5547218322754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4355.852947235107, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3977.1823692321777, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3639.569969177246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3756.5689849853516, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4102.888488769531, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4246.247692108154, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3916.4363288879395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4063.869132995605, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4354.857635498047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3966.131076812744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3652.851333618164, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3771.1635208129883, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4087.170925140381, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4235.361251831055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3896.888198852539, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4048.754539489746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4368.2402992248535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4860.128269195557, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4339.653720855713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5431.69454574585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5583.976955413818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4665.000591278076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4324.3256187438965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5608.589458465576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5810.016174316406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4878.243026733398, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, 
"num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4247.850227355957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5350.406894683838, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5542.430839538574, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4630.462207794189, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4282.983207702637, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5511.516456604004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5707.33154296875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4890.0346755981445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4245.938529968262, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5352.949619293213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5560.164966583252, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4683.6944007873535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4238.619518280029, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5513.0682945251465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5727.06579208374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4875.992813110352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4254.8846435546875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5354.137725830078, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5553.392639160156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4698.7470626831055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4241.160469055176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5504.038066864014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5713.783416748047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6160.447177886963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8896.777877807617, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7142.779083251953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6663.7260818481445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5955.458526611328, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8861.645164489746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7241.336822509766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6665.782623291016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5975.148658752441, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8893.209762573242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7256.1761474609375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6693.633346557617, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5989.556312561035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8896.155548095703, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7316.997108459473, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 
256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6686.792411804199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3411.756172180176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3311.597900390625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3872.435531616211, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4017.947502136231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3651.4743995666504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3343.945598602295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3790.4512214660645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3914.3822288513184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3369.582862854004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3199.1705799102783, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3732.6598358154297, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 3764.6467208862305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3665.0406455993652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3255.735673904419, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3813.533306121826, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3874.554023742676, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3361.0094261169434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3179.795846939087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3726.3537979125977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3765.1804542541504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3694.184799194336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3269.3628883361816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3837.036647796631, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3890.145778656006, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 
3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3347.337589263916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3169.575662612915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3718.5826110839844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3782.865791320801, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3685.0535583496094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3274.3803787231445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3838.623790740967, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3874.7361755371094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4412.995338439941, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4529.288959503174, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4403.776073455811, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4407.669315338135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4226.780014038086, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4427.486553192139, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4245.452346801758, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4343.118095397949, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4215.660171508789, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4430.881462097168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4276.342086791992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4352.206382751465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4197.697582244873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4424.772644042969, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4249.654407501221, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4348.68782043457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9956.268844604492, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6132.611827850342, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9914.452590942383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5721.67423248291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9915.113525390625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5726.709575653076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9963.97087097168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5745.419521331787, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3288.0713844299316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3440.7533073425293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3533.184986114502, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4117.703990936279, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3020.559377670288, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3217.171401977539, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, 
"num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3135.1529598236084, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3363.2597160339355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3428.556308746338, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4233.564605712891, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3077.113914489746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3138.3896160125732, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3087.5153827667236, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3367.1657371520996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3415.7278442382812, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4304.258728027344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3058.9667224884033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3139.2878437042236, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
3039.786729812622, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3359.0796661376953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3409.5737838745117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4302.924461364746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3076.761131286621, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3132.5169563293457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5709.120826721191, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4337.77811050415, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5501.945781707764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3949.434070587158, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5529.47904586792, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3974.8748779296875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5538.884315490723, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3989.529285430908, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3579.679374694824, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3477.0164680480957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3738.2571601867676, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4172.719345092773, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3737.8482055664062, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3696.956615447998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3858.394241333008, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4523.578262329102, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3870.459041595459, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3734.9222373962402, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3910.4451179504395, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4386.444339752197, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3927.2436904907227, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3854.030227661133, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4021.6636466979985, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4648.143367767334, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3917.4765014648438, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3744.125270843506, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3912.363510131836, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4372.611408233643, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3943.5300827026367, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3885.4993438720703, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4039.8006439208984, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4611.114368438721, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 3929.060935974121, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3757.2817611694336, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3933.6196517944336, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4405.962390899658, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3971.8227005004883, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3995.2252769470215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4136.6657638549805, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4662.1452713012695, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4300.489978790283, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4640.682849884033, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4648.847255706787, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4427.051029205322, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5055.168476104736, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5191.24719619751, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4579.711971282959, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4833.220100402832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4886.607837677002, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4585.250225067139, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5087.9157066345215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5206.540508270264, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4609.048328399658, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4840.1225662231445, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4887.937641143799, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4592.612934112549, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5114.92338180542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5207.694129943848, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4592.380828857422, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4966.184329986572, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4901.796016693115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4608.4792137146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5195.35774230957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5222.2881507873535, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6292.737102508545, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6416.084156036377, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6681.443252563477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7235.70613861084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6843.65852355957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7262.9949951171875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 7099.631423950195, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7125.032081604004, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2660.330228805542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2699.741430282593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3008.1691455841064, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3090.1102352142334, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2841.2643146514893, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2739.583044052124, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3314.29386138916, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3351.932792663574, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2697.7091312408447, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2719.2323207855225, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2911.317768096924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3014.7740650177, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2900.0027179718018, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2746.8052864074707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3317.6177406311035, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3337.038097381592, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2701.7580890655518, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2711.64701461792, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2929.2761611938477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3015.688304901123, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2889.711494445801, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2723.3454418182373, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3315.72359085083, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3343.924789428711, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2689.4153594970703, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2693.5574340820312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2975.6169509887695, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3062.254867553711, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2888.156156539917, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2696.095027923584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3328.581771850586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3340.1158332824707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3976.032199859619, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3242.522602081299, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4588.773288726807, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3661.8167686462402, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 4165.911712646484, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3171.815528869629, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4771.067371368408, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3660.054931640625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4221.89245223999, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3169.11057472229, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4794.654693603516, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3656.919403076172, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4282.778377532959, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3213.0689430236816, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4823.065624237061, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3661.6787147521973, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2250.163679122925, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 
3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2389.010238647461, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2427.599687576294, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2276.1079692840576, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2301.0649585723877, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2359.819211959839, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2212.0164680480957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2294.3850994110107, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2225.3878116607666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2271.3478469848633, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2296.613130569458, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2195.966739654541, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2189.402551651001, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2299.847345352173, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2223.944625854492, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2253.1158351898193, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2303.5876655578613, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2190.855369567871, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2189.1111850738525, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2315.421733856201, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2252.940788269043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2252.782096862793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2316.4827251434326, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2211.952495574951, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3048.3747005462646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2964.126558303833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3124.6545600891113, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3031.409730911255, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3150.1737689971924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3046.3764667510986, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3191.504487991333, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3081.980972290039, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2653.1222343444824, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2369.8475074768066, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2735.572328567505, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2435.5695819854736, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2742.113780975342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2421.7654418945312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2770.2352046966553, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2436.0139179229736, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3506.628475189209, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3608.0536460876465, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3371.234073638916, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3577.9761505126953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3515.850601196289, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3516.7840003967285, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3316.903839111328, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3506.32869720459, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3858.8800048828125, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3948.9412879943848, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3792.635040283203, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
3824.6449851989746, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3800.840301513672, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3813.681240081787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3674.298572540283, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3716.061420440674, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3886.4574241638184, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3977.9373359680176, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3791.8465995788574, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3967.6192474365234, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3823.4302139282227, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3834.3567848205566, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3667.2668838500977, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3859.7997093200684, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3906.6576194763184, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4005.243988037109, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3789.136619567871, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4155.125885009766, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3843.2198333740234, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3869.423007965088, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3676.29695892334, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4045.7315063476562, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4659.537754058838, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4484.645309448242, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5117.50452041626, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4955.887184143066, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5172.786407470703, "config": {"BLOCK_SIZE_M": 128, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4974.456748962402, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5216.9169998168945, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5019.363479614258, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2496.210880279541, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2517.8284740448, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2396.563034057617, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2446.7660903930664, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2379.775342941284, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2349.7417545318604, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2573.028335571289, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2571.6137504577637, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2496.100015640259, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 2512.514228820801, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2459.3337535858154, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2386.9185638427734, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2580.7145404815674, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2583.99582862854, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2548.3105659484863, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2515.1895904541016, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2473.508176803589, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2437.9619312286377, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2594.046697616577, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2616.5042972564697, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2580.185146331787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2524.306221008301, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2493.4907245635986, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2472.1491050720215, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3462.219524383545, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2972.778091430664, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3604.2043113708496, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3131.0430335998535, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3639.336452484131, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3161.7191791534424, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3681.749267578125, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3194.309787750244, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2761.0827255249023, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2083.7707138061523, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
2475.4097652435303, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1823.7574291229248, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2826.931371688843, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2091.691026687622, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2516.9739151000977, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1826.8292713165283, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2845.4255962371826, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2107.886390686035, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2531.731996536255, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1830.2620792388916, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2886.7788696289062, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2124.3949031829834, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2532.0750427246094, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 
3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1843.4707164764404, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9357.18978881836, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1935.8806419372559, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9299.665985107422, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.0145473480225, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9339.782829284668, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1995.8995532989502, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9433.947868347168, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2019.573745727539, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4189.321727752686, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3215.68660736084, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4252.872180938721, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3235.896167755127, "config": {"BLOCK_SIZE_M": 256, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4701.358585357666, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3447.967052459717, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4758.719863891602, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3430.4079818725586, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4717.36701965332, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3460.278377532959, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4778.1086349487305, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3441.537628173828, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4823.398418426514, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3521.0648155212402, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4879.204940795898, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3506.706199645996, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 3868.5687828063965, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2769.857921600342, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3918.009262084961, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2919.9692916870117, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4039.7025489807124, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2953.751850128174, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4260.711822509766, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3060.1609802246094, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15353.79753112793, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2124.056167602539, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15376.233596801758, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2176.843204498291, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15412.657318115234, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, 
"num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2215.5020904541016, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15426.503601074219, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2279.019536972046, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.45039999485016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 150.69792091846466, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 149.7355192899704, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.03056073188782, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.95344066619873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.68080008029938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.060959815979, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.80224061012268, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.10928070545197, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.01664006710052, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.15584015846252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.48559999465942, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.52992033958435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.29471957683563, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.09200143814087, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.99055922031403, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.7033599615097, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.1153599023819, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.17295920848846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.90783989429474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.49520087242126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.0083191394806, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 159.65439975261688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.8662406206131, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.72784006595612, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.49439918994904, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.37823963165283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.07344043254852, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.99295926094055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.76927947998047, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.36784040927887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.74752008914948, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.42127990722656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.4313609600067, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.25791907310486, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.77423977851868, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.5366405248642, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.3931188583374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.56047928333282, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.1374410390854, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.86272037029266, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.91935896873474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.21360063552856, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.21856009960175, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.04879999160767, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.9203199148178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.05248022079468, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.78048050403595, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.9871997833252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.03648006916046, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.37999892234802, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.65424036979675, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.96384024620056, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.22560095787048, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 185.2950394153595, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.53231966495514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.79983949661255, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.01967930793762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.86303961277008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.01760005950928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.16432011127472, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.55104076862335, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.43856120109558, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.3924798965454, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.37008094787598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.56496012210846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 230.8844769001007, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 230.60800075531006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.93551993370056, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.59008073806763, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.19871830940247, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.59888100624084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.19296061992645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 179.89247858524323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 231.6652810573578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 231.54736042022705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.4027203321457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.43359971046448, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.83327770233154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.00016021728516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.25824058055878, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.32352018356323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 232.94415950775146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.33104038238525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.81968021392822, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.57551956176758, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.4515197277069, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.6992003917694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.36015856266022, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.26719880104065, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 233.24703931808472, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.65280055999756, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 191.49711906909943, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.2491194009781, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.32896256446838, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.16896200180054, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.69264030456543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.8857605457306, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.9398386478424, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.15072095394135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.40432024002075, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.5068792104721, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.16815972328186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.4772790670395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.5630396604538, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.1950399875641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.1499207019806, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.70640003681183, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.53648054599762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.3998395204544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.1524807214737, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.98399913311005, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.26672065258026, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.55951976776123, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.85712039470673, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.31472027301788, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.61184084415436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.39295935630798, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.16511988639832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.1276797056198, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.66991865634918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.24351906776428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.010560631752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.34303975105286, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 165.00735998153687, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.10719847679138, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.199840426445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.44687938690186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.31727957725525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.80768084526062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.3873610496521, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.91631960868835, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.51616048812866, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.96559989452362, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.90031898021698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.19999992847443, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.17680060863495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.08559834957123, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.57824039459229, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.59632062911987, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.72127985954285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.92304074764252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.01920127868652, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.90704035758972, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.1710386276245, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.22160017490387, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.5967993736267, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.1868795156479, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.22224009037018, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.82863926887512, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.09647965431213, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.8220797777176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.36751890182495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.4454402923584, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.51712048053741, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.59407937526703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.8430414199829, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.7148813009262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.44256055355072, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.89279973506927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.44048023223877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 204.4696009159088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.13263976573944, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.49151873588562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.14208030700684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 200.76879978179932, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.95296132564545, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.8542401790619, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.1551994085312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 203.20144057273865, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.4385598897934, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 185.22816061973572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.55791974067688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 203.7329602241516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.5001586675644, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 183.52832078933716, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.74160051345825, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.60736083984375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.7956793308258, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.51119947433472, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.56512022018433, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 150.91855883598328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.354079246521, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.14991998672485, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.98976004123688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.9126387834549, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.73264026641846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.83071970939636, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.6275199651718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.50543999671936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.83551919460297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.77504110336304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.42975914478302, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.38704097270966, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.93152022361755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.01104032993317, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.4513601064682, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.91376042366028, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.08207952976227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.8555190563202, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.99152040481567, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.71023952960968, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.76879930496216, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.01855874061584, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.0619193315506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.3384004831314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.10511946678162, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.87984085083008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.75295972824097, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.15808033943176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.03152060508728, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.16528034210205, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.6841596364975, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.0489593744278, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.21919977664948, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.06384015083313, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.29728066921234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.78608000278473, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.76623928546906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.4524803161621, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.56639957427979, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.89151895046234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.12063896656036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.67711997032166, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.52255988121033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.38416051864624, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.86464047431946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.4291205406189, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.11039912700653, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.26736080646515, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.52336061000824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.90527963638306, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.579039812088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.1216002702713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.8436801433563, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.86928033828735, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.44528126716614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.8523187637329, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.23792338371277, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, 
"num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.60656070709229, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.0004804134369, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.37855851650238, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.8857593536377, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.63296175003052, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.0030403137207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.63072049617767, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.2297601699829, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.67119979858398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.9124791622162, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.50512146949768, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.55184054374695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.19440078735352, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.49727964401245, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.37599980831146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.25760090351105, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.9635202884674, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.59504067897797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.15344083309174, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.53616058826447, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.9177609682083, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.34287905693054, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 191.49551928043365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.4083207845688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.47408092021942, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 192.68063962459564, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 185.73984026908875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.1723198890686, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.3742400407791, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.33616054058075, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.03407907485962, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 191.89024031162262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.23391997814178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.96016025543213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.91359865665436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.34271812438965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.62304186820984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.21487975120544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.9670408964157, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.75471985340118, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.69840002059937, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.1233607530594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.74863970279694, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.4263995885849, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.10112011432648, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.45759975910187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.896479845047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.36655950546265, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.75375962257385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.97776019573212, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.56959974765778, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, 
"num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.4251207113266, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.47439908981323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.0039985179901, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.14047861099243, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.53871977329254, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.80736076831818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.13104021549225, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.0048007965088, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.28239905834198, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.5307193994522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.6656002998352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.29392170906067, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.59440004825592, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 196.32783830165863, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.62688064575195, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.86799955368042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.32800030708313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.2555195093155, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.5619192123413, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.12976050376892, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.44992101192474, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.8102412223816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.6822406053543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.32912051677704, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.38991963863373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
200.06319761276245, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.0823996067047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.89967954158783, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.32176005840302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.55328011512756, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.68655967712402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.67535960674286, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.22607898712158, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.72272372245789, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.86719965934753, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.46352005004883, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.32240045070648, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.2708775997162, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.3342399597168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.88031935691833, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.73312056064606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.58175897598267, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.95983910560608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.70463871955872, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.7838408946991, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 206.30447924137115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.78480052947998, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.6644802093506, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.07919669151306, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.48752164840698, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 232.25311756134033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, 
"num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.39935839176178, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.4579187631607, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.26864171028137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.88159823417666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.4624000787735, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 224.87695813179016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.32368111610413, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.20944106578827, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.81952118873596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.83712244033813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.49727964401245, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 227.5519984960556, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.5512011051178, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.51216113567352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.13535904884338, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.40959930419922, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.63311982154846, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 227.93264091014862, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.2776017189026, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.743199467659, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.2022407054901, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.12400043010712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.8691202402115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.7564799785614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.09503960609436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 155.60639917850494, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.13216030597687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.63296020030975, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.82367980480194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.12992131710052, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.8766404390335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.47664082050323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.93616092205048, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.984800696373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.757758975029, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.92400085926056, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.69823813438416, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.0063999891281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.42656135559082, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.55247914791107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.61103999614716, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.29135990142822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.19727861881256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.7915209531784, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.5574390888214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.07968151569366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.80560171604156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.5083202123642, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.39759957790375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.0598406791687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.25311923027039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.8224000930786, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.68255925178528, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.7468799352646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.32255935668945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.59264087677002, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.27200043201447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.04255974292755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.90607941150665, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.24704158306122, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.91567969322205, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.55248081684113, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.17872047424316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.4097602367401, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 
64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.06048047542572, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.67519783973694, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.71871995925903, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.4862381219864, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.17551958560944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.40143883228302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.81695973873138, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.553280711174, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.13776004314423, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.33488059043884, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.72944116592407, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.37648034095764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 164.55328047275543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.49967873096466, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.27728056907654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.9236809015274, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.89984214305878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.43728017807007, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.89904034137726, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.45871901512146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 231.47487878799438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 207.36495971679688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.82607913017273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.81967997550964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 230.35696029663086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 206.04048132896423, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.15471816062927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 185.25120079517365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 229.41807985305786, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 206.3483190536499, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.93311965465546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.84720063209534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 231.87552213668823, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 206.9705581665039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.8438402414322, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.63039934635162, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.62480008602142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.74032056331635, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.34880018234253, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.56303906440735, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.83616018295288, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.4902399778366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.9276807308197, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.41120088100433, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.79808020591736, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.81232011318207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.91087925434113, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.08816003799438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.1292805671692, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.9582403898239, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.33183991909027, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.96863925457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.0087994337082, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.51327979564667, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.3899201154709, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.47488141059875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.76576161384583, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.35455989837646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.48287868499756, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.28928089141846, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.15184140205383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.0643196105957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.66607999801636, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 178.8403195142746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.78703904151917, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.29919981956482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.48640024662018, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.83759808540344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.90367913246155, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.05728149414062, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.45280003547668, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.10607981681824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.1484798192978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.722238779068, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.0113605260849, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.8334412574768, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.17631912231445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.28896045684814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.50672006607056, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.0951999425888, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.9264007806778, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.1619223356247, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.7431995868683, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 315.8030414581299, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 186.55376076698303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 314.1470408439636, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.68752002716064, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 313.718878030777, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.74671971797943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 
32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 312.5377595424652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.49568057060242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 211.00928008556366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.43535923957825, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.0980784893036, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 205.11056005954742, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.88384091854095, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.31887888908386, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 212.02159881591797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.9027200937271, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.79312086105347, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 209.25599932670593, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.1166399717331, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.47167909145355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.49951922893524, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.47871923446655, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.02896010875702, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.77856063842773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.92063987255096, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.3062402009964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.55487871170044, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.12224090099335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.82448017597198, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 209.62159872055054, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.2233612537384, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 163.0076801776886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.44864201545715, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.7331190109253, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.0635199546814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.725279211998, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.3529658317566, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.99152100086212, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.4566388130188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.22880148887634, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.69360053539276, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.85759973526, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.20048010349274, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.20736038684845, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.79168117046356, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.96319949626923, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.79311990737915, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.34767985343933, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.6791990995407, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.53760063648224, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.03488051891327, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.14975833892822, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 191.9319999217987, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.62143921852112, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.70431971549988, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 212.20144152641296, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.51007986068726, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.04687929153442, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.49151968955994, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.13551926612854, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 191.64543986320496, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.1158413887024, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.14223992824554, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.34960162639618, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.4664009809494, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.4425595998764, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.08463847637177, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.81535875797272, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.87183952331543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.7304002046585, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.3415995836258, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.64767956733704, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.73680138587952, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 213.74927937984467, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.4323160648346, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.87696170806885, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.11264038085938, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 232.46831893920898, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 196.74911975860596, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 212.60720074176788, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.73920142650604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.6107213497162, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 229.1430377960205, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 226.0867202281952, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 201.46607875823975, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 216.69024109840393, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.80512046813965, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 233.33600163459778, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 231.73727869987488, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 229.18943762779236, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.16432082653046, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 212.0748782157898, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.86415767669678, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 232.87392020225525, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.79472136497498, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 230.30336260795593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 294.5622384548187, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.5008008480072, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 285.1742386817932, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 294.405118227005, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.7708809375763, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 300.2516806125641, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 292.02272057533264, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 297.50223755836487, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.60176002979279, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.60128045082092, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.7908810377121, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.66800010204315, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, 
"num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.74912095069885, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.0027197599411, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.6252804994583, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.21456038951874, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.34960079193115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.7959998846054, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.76576030254364, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.8563185930252, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.71999967098236, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.314399600029, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.04816055297852, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.10815978050232, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.78799974918365, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.16352033615112, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.9838389158249, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.23151993751526, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.89567971229553, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.15759921073914, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.25039982795715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.20527935028076, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.81984090805054, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.04127979278564, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.13072037696838, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.015199303627, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.47120141983032, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
160.4281586408615, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.71311902999878, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.7104001045227, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 206.92512094974518, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.34607899188995, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 227.73215889930725, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.33023929595947, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.14239859580994, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.5110386610031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.7659239768982, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.1804802417755, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 205.0601589679718, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.49696099758148, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.99072408676147, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.093279838562, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 206.10463917255402, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.28672075271606, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.5974419116974, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.06976056098938, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.73232126235962, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.281919836998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.064000248909, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.10015964508057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.7222397327423, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.23727977275848, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.35791981220245, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} 
+{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.44047915935516, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.52815961837769, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.59343934059143, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.45616137981415, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.28223931789398, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.36703991889954, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.74832117557526, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.2865605354309, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.46239984035492, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.4827196598053, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.3974405527115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.3734403848648, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.63552105426788, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.650399684906, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.8736013174057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.9953602552414, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.8182407617569, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.12192142009735, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.96160101890564, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.69151973724365, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.7414401769638, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.54399859905243, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.46752035617828, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.10336124897003, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.71616005897522, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
227.779198884964, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 207.62336134910583, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.58287930488586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 206.46415948867798, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 227.30000138282776, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 205.49311876296997, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.69855880737305, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 204.8859190940857, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.0721607208252, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.7169587612152, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.03184056282043, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.80335879325867, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.05072236061096, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.18207812309265, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.00400233268738, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.38623976707458, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.5219204425812, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 277.09375977516174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.77520370483398, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.65999960899353, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 292.14239954948425, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.40320086479187, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.74687957763672, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.80896162986755, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 287.74927973747253, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 275.74896335601807, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} 
+{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.5024013519287, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.53792142868042, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.2574405670166, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.5294370651245, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.6815996170044, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.3460793495178, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.96991991996765, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 278.9619183540344, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.34991979599, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.07568192481995, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.84912037849426, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 275.0611209869385, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.1238396167755, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.45407891273496, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.4670422077179, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 341.28031969070435, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 342.1727979183197, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 339.3601596355438, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 343.49743843078613, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 337.02688217163086, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.79840064048767, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 334.4364798069, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 204.90207970142365, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.42191970348358, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.72048115730286, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
201.3379204273224, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.38480019569397, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.08336114883423, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 205.77903747558594, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 201.92095935344696, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.41392064094543, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 201.01472079753876, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.48048055171967, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.8647998571396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 203.16927909851074, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.9374407529831, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.26367938518524, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 201.61232113838196, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.0217628479004, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.95680105686188, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 203.39184165000916, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 200.48672378063202, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.26671886444092, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.8686408996582, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 191.29791975021362, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.61423909664154, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.8993618488312, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.57088112831116, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.0646402835846, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.96399927139282, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.3617603778839, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, 
"num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.1057629585266, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.80623984336853, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.5449616909027, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 225.28208136558533, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.99296081066132, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.6732804775238, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.4488000869751, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 225.94768166542053, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.64687943458557, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 209.83007788658142, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.3123208284378, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 225.6771218776703, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.56256079673767, "config": {"BLOCK_SIZE_M": 128, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 209.6016013622284, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.4497607946396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 225.54175853729248, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 185.20048022270203, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 209.4760024547577, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.41823995113373, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1046.9865655899048, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.89664149284363, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1039.7495985031128, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.25871801376343, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1048.6976099014282, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.28000116348267, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 1049.9424028396606, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.4414393901825, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.833432674408, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.8878357410431, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.4657554626465, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.32623958587646, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.890079498291, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.58991980552673, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.1241636276245, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.22288155555725, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.0121564865112, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.59711813926697, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.8275213241577, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 399.49488282203674, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.8575963973999, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.46767950057983, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.6048030853271, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 402.61215806007385, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.027045249939, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.73151993751526, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.4056024551392, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.46607971191406, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.9331202507019, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.94943833351135, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.6887955665588, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.10223841667175, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2004.9923133850095, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 280.5076801776886, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2004.9726390838623, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 279.371680021286, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2009.775676727295, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 277.35008120536804, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2001.6785526275632, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 275.7851207256317, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.5899221897125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.41791915893555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.3571183681488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.5233588218689, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.6763210296631, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.85487961769104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.32143998146057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.7729594707489, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.96288204193115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 342.6679992675781, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.2633628845215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.765442609787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.894079208374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.8280007839203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.6251208782196, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.41264390945435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.40736150741577, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
354.8375999927521, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.15296244621277, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 349.55039858818054, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.0814392566681, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.0839982032776, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.1713614463806, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.5841598510742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.0174403190613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.9780797958374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.2766389846802, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.79936051368713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.05168080329895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.54928064346313, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.0457592010498, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.09791898727417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.1951994895935, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.7022387981415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.80336117744446, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.1308796405792, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.82352089881897, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.44288086891174, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.932000875473, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.73775911331177, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.8291189670563, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.7942407131195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 343.31103920936584, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 
8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.4702398777008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.67232060432434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.1244761943817, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.04992294311523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.802237033844, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 344.79056000709534, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 343.85151982307434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.01856303215027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.92719650268555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.5420799255371, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.1031982898712, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.7860805988312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 391.2289583683014, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.497918844223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.41663932800293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.31024146080017, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.9520003795624, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.7297646999359, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.302401304245, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.9635193347931, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 383.87743949890137, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.0740761756897, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.2872009277344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.44592332839966, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.9347233772278, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 423.2379174232483, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 408.87295484542847, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.8046350479126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9209585189819, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.44351983070374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.25920033454895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.3529648780822, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.91695976257324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 425.6774389743805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.61424112319946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.1743965148926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.7612724304199, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.32080006599426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 395.9553575515747, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.55631923675537, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2961602210999, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 421.61983847618103, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 408.2192015647888, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6585645675659, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.4771203994751, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.9540753364563, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 397.08863854408264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.99551916122437, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.04512166976934, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 414.6875214576721, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 420.44528007507324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.5196838378906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.514726638794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.43135929107666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 339.58848118782043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 343.0415999889374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.9095993041992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.9635200500488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.93136048316956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.6720037460327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.2540822029114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.1446385383606, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.32384276390076, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 342.43264079093933, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, 
"num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.9241580963135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.94656109809875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.6916787624359, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.65856170654297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.6825602054596, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.0950403213501, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 341.18223786354065, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.48560094833374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.95504117012024, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.0113613605499, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.9260823726654, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.3894410133362, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.22528100013733, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.30623841285706, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 341.11727833747864, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 342.6046419143677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.47183990478516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.12752079963684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.4619174003601, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.64095997810364, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.90176010131836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.80735778808594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 344.86608266830444, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.4193594455719, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.2404816150665, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
349.7056007385254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.04111981391907, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.0017580986023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.52351927757263, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.01024055480957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 349.2580807209015, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.84815788269043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.32255959510803, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.61375856399536, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.4623987674713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.37487840652466, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.48256158828735, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.93232226371765, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.7238392829895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.56447982788086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.5804777145386, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.865761756897, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.7179214954376, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.51279640197754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.67648005485535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.8604781627655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.3808009624481, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.20687770843506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.6556794643402, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.6340775489807, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.7652778625488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} 
+{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.4944031238556, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.5984010696411, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.55359721183777, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.8644742965698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.25072288513184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 408.88320446014404, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.0764811038971, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.0502419471741, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.0169606208801, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 408.14671874046326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 397.8484785556793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.98735666275024, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.6444787979126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 413.2750380039215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.47216176986694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.9644775390625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.2780821323395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 412.89552330970764, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.72896122932434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.0860800743103, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.5313606262207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.76383924484253, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 342.6635229587555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.0982406139374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.207679271698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
356.72832131385803, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 343.8540816307068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.13647985458374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.7230398654938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.81967878341675, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.9747188091278, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.5580792427063, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.17903995513916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.0478403568268, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.17711877822876, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.62160062789917, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.1153597831726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.8246397972107, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.45887994766235, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.90367913246155, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.9979181289673, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.45792174339294, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.8243193626404, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.0964787006378, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.67455887794495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.9927999973297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 341.6254389286041, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.0527992248535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.40320205688477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.9851191043854, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.60816383361816, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} 
+{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.03568053245544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.6684787273407, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.78528237342834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.65312099456787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.7436797618866, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.4284813404083, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 379.1543996334076, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.04896235466003, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.99823808670044, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.2345595359802, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.42816138267517, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.43200159072876, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.5271985530853, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.38592052459717, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.9892797470093, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.82623839378357, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.7487995624542, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.9412808418274, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.19039821624756, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.2830390930176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.2203199863434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.511198759079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.1075220108032, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.1379237174988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 417.2507178783417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 639.6563243865967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 416.0483229160309, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.801598072052, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 419.39695596694946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.7751998901367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 416.7587184906006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.5607979297638, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.439838886261, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.6131205558777, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.41167879104614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.688481092453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.7660789489746, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.9688003063202, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.31936287879944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.59983706474304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.0332806110382, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.2094385623932, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.1113615036011, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.68111872673035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.59967947006226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.39983916282654, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.1115207672119, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.2902412414551, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.9166407585144, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.65312099456787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.84272170066833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.77775979042053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.8860788345337, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.5297598838806, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.07247829437256, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 423.2803225517273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.4257607460022, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 422.7028822898865, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 411.20911955833435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 424.1646361351013, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 403.1371212005615, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 433.1009578704834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 397.014080286026, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.9987196922302, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.6006398200989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.38112235069275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.65184020996094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 418.2320022583008, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.1993598937988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 384.6611201763153, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.2161555290222, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 384.8483204841614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.9476807117462, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.3263998031616, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.547518491745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.70048213005066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 358.4912037849426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 384.42352056503296, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.0630407333374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.2523195743561, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.7355201244354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.9225609302521, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.8668808937073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.93343901634216, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.28959941864014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 388.19488048553467, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 388.08544278144836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.1404821872711, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.07983922958374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.0779182910919, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.40111804008484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 381.7400002479553, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.19904088974, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.0273609161377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.0462417602539, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.73519921302795, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.0271999835968, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.5371198654175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.8923192024231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.2135977745056, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.08367919921875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 383.34271788597107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 
8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.241916179657, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.4007980823517, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.72255992889404, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.2268810272217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.80208444595337, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 402.5428819656372, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.8991997241974, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.215039730072, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.2887969017029, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.5060818195343, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.52143955230713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.6553599834442, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.00783801078796, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 400.23695826530457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.11504340171814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.5068814754486, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.24592113494873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.6883237361908, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.9115207195282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.4185588359833, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.7844805717468, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 396.15007758140564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.24959993362427, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.43903732299805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.41440057754517, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.4249610900879, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.7996826171875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.9969639778137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.81311559677124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7753615379333, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.9249625205994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.10383892059326, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.3755221366882, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.4019150733948, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.260968208313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.7377586364746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.5563173294067, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.3742370605469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 569.6419191360474, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.2289552688599, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.3734402656555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.0355205535889, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.8134393692017, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.6851134300232, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.3137617111206, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.0582404136658, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.7771203517914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.0260806083679, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.2487988471985, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 379.76768016815186, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.4931173324585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.58335995674133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.679678440094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.4758417606354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.73792028427124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.57776141166687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.63135838508606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.1825575828552, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.11215901374817, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.2548773288727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.073118686676, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 394.5423996448517, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.81087851524353, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.5443186759949, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, 
"num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.0348825454712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 394.0619206428528, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.4127984046936, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.879998922348, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.7588815689087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 400.3596806526184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.56176352500916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.20928263664246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.5992012023926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.40607810020447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.5083222389221, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.50928235054016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.85007905960083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.11424136161804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 383.7977600097656, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.62751722335815, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.4003210067749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.8984007835388, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.5193591117859, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.09023809432983, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 402.92192339897156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 384.5527982711792, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.27455735206604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 381.62495970726013, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.2089583873749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
374.3169593811035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.44816279411316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.5252802371979, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.53264117240906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.8889584541321, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.59439992904663, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.7447998523712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.9720034599304, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.84640192985535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.8728015422821, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.87167477607727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 397.1996808052063, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.19983768463135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.58976125717163, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.97712087631226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.99295687675476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.16992020606995, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.27711820602417, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 388.1816029548645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 403.5488021373749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 381.84383749961853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.83935832977295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.4552011489868, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.722718000412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.60400581359863, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.3881583213806, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} 
+{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.2761559486389, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.42463731765747, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.9806408882141, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.4912042617798, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.49087476730347, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.0980851650238, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.39424085617065, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.8846406936646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.7939205169678, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.7867183685303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.02688121795654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.5734438896179, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.86848306655884, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.5278356075287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.7054388523102, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.00784373283386, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.3088004589081, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.62479877471924, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.85263562202454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 379.19536113739014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.7564797401428, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.08079981803894, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.53103971481323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.3243179321289, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.4099187850952, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
366.8193590641022, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.1049563884735, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.12287974357605, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.2011194229126, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.7438397407532, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.313600063324, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.6476786136627, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.011682510376, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.8743999004364, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 403.64431858062744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.81647849082947, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.33088088035583, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.92848014831543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.68943977355957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.0681571960449, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.2251205444336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.4359998703003, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 410.67471742630005, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.6622385978699, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.65808057785034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.219521522522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.8302412033081, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.39375948905945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 418.1006383895874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.7520024776459, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 396.27135276794434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} 
+{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.8844804763794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 425.2235162258148, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.9435176849365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.2632019519806, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.34111857414246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 427.8235173225403, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.71247720718384, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 400.7270383834839, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 391.0100769996643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 428.24560165405273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.6801574230194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.2201633453369, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.756959438324, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 785.9078407287598, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.2998352050781, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 773.4543943405151, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.35072088241577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 782.4633526802063, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.55023884773254, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.28512144088745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 401.4401614665985, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 391.83664202690125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 434.3164849281311, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.30783796310425, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.61056089401245, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
357.44704246520996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.1374409198761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.653920173645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.7168028354645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.89503931999207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.4851200580597, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.1511986255646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 402.015997171402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.6796820163727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.1756854057312, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.6379179954529, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.1532790660858, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.89264130592346, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.28080463409424, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 391.7915213108063, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.0246386528015, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 383.45679998397827, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.09855461120605, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.7404832839966, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 427.10432052612305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.0264048576355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 423.5956811904907, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.1567983627319, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 416.99999809265137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.6719994544983, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 422.7384042739868, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} 
+{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 413.5652816295624, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.6907217502594, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.21552205085754, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.27584409713745, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 431.3356876373291, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.71440410614014, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.39040207862854, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.7235198020935, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 423.8988780975342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.2407991886139, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.12719988822937, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.5287938117981, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.84703731536865, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 384.74496126174927, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 409.3963158130646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.32032203674316, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 418.19632291793823, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.04255747795105, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.824960231781, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.679847240448, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 434.15071964263916, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 388.77920031547546, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.0116775035858, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.348002910614, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 413.7985599040985, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.18127727508545, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.6857604980469, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.0094413757324, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.58800172805786, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.2455987930298, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.30256032943726, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.34704065322876, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.5243215560913, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.8396773338318, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.1598482131958, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.4027171134949, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.1940841674805, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.6007943153381, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 449.8694396018982, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.1163239479065, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.8313660621643, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.6585597991943, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.717116355896, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.5123205184937, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.64112520217896, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.83039999008184, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.3241629600525, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.3905620574951, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.8428783416748, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.1305575370789, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.13823795318604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.60111999511713, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.55247831344604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.7777619361877, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.4174389839172, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.3292832374573, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 698.331356048584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.359842300415, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 681.4406394958496, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.181435585022, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 686.800799369812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.3745579719543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.4708819389343, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 698.721284866333, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.24911761283875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.1425585746765, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 383.341760635376, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.20928168296814, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.83712220191956, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.58672165870667, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.88511657714844, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 396.49280071258545, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.1388795375824, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.2590401172638, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.95647978782654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 379.3675231933594, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.71935749053955, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.8390402793884, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.14048051834106, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 396.55919790267944, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.6742398738861, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.6772794723511, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.2427203655243, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.82736015319824, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.0296006202698, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.3807978630066, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.18912267684937, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.3033585548401, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.3095989227295, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.48624062538147, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.5376012325287, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.80928087234497, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.2969596385956, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.59616136550903, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 394.25456166267395, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.13376331329346, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.06367921829224, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 416.65743827819824, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.2703976631165, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 410.9447991847992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.2446389198303, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 421.3369596004486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 524.409761428833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 413.63168001174927, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.2580828666687, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 415.29184341430664, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.8752021789551, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 414.4822382926941, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.02303981781006, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 421.29440784454346, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.1267223358154, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 412.3483216762543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.3872013092041, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.96912026405334, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.2875213623047, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 407.02927470207214, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.87807965278625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.50127840042114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 394.9934387207031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.57168340682983, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.295681476593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.59535121917725, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.4092798233032, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.00784134864807, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 396.4246428012848, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.40368127822876, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.07056045532227, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 408.6971187591553, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.86463952064514, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.40416073799133, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.06096363067627, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.21984028816223, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.0948803424835, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 406.5184020996094, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.0825581550598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.2387239933014, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.95408487319946, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 424.2969584465027, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 431.90751791000366, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 426.376314163208, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 429.4601607322693, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 427.5508785247803, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 433.93439769744873, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 423.7883222103119, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.1495933532715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 434.93279933929443, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.0990381240845, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 434.5747184753418, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.3980793952942, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 436.6902446746826, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.1707158088684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.9801559448242, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 680.3433632850647, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 648.4752011299133, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.4640030860901, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.419683933258, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 685.7265591621399, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.6284809112549, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.9652786254883, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.1249613761902, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 682.6464033126831, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.178081035614, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.9079976081848, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.0596823692322, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.9752039909363, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 637.0489621162415, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.6879992485046, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.2524814605713, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.7729578018188, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.2892823219299, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.4974374771118, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.0148782730103, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 676.7872071266174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.55504322052, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.8300805091858, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.6980786323547, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.3321633338928, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 650.3438329696655, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.3699131011963, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.726719379425, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.2940802574158, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.5793724060059, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.8817591667175, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.19952917099, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 816.7271971702576, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 804.858386516571, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.522566318512, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 784.1417622566223, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 815.1971197128296, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 789.3760061264038, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 820.2936053276062, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 798.3564829826355, "config": {"BLOCK_SIZE_M": 128, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.9671995639801, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.3204846382141, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 425.29152154922485, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.5383915901184, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.8899235725403, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 410.2552008628845, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2796783447265, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.8385579586029, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 436.46512031555176, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.05856132507324, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.22304010391235, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 417.2472023963928, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 498.56783390045166, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.4348797798157, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 428.0430340766907, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.62896156311035, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.68944025039673, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 413.3393609523773, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.1817612648011, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.4132823944092, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 425.84911823272705, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.95551919937134, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.0324811935425, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 408.69728088378906, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.13840675354, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.4559993743896, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.6900806427002, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.984959602356, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.1736001968384, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.9135999679565, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.827356338501, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.0371255874634, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.9036831855774, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.5608024597168, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.86783313751215, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 400.36367893218994, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.0955200195312, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.97567653656006, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.4403190612793, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.0486397743225, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.8708834648132, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.81536054611206, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.8788771629333, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 401.7859184741974, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.4353585243225, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.0551996231079, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3358335494995, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 403.38640093803406, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2157.370252609253, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.2390398979187, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2155.1417446136475, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.4395213127136, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2131.9035243988037, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.1879997253418, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2130.774555206299, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.282399892807, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1347.037591934204, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.356481552124, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1332.7947187423706, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.4536037445068, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1370.317120552063, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 958.6412858963013, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1347.9142379760742, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 980.7564830780029, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1312.3680114746094, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.7958307266235, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1353.0207967758179, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 959.7055912017822, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1315.1753616333008, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 922.0108890533447, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1355.9905529022217, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 957.0036745071411, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1369.8750305175781, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 839.3732786178589, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1363.5646343231201, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 843.1299209594727, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1358.3686447143555, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 854.3976020812988, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1360.46639919281, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.8870277404785, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5400.282554626465, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.2171168327332, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5521.018867492676, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 712.3126459121704, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5568.6542320251465, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.9088039398193, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5514.854431152344, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.6579208374023, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.400643825531, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.9092836380005, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.1172823905945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.7408003807068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.987361907959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.8251152038574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.79264307022095, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.63056087493896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.01664304733276, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.4689564704895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.8795237541199, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.2563228607178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.0180835723877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.7279987335205, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.87760066986084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.448956489563, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.9009623527527, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.80447816848755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.23360204696655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.8470411300659, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8371243476868, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.03808307647705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.84784173965454, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.77503633499146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.5131254196167, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.0177607536316, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 468.0947184562683, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.44032096862793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.7126398086548, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.16592264175415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.15376329421997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.04224395751953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.3411202430725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.4599928855896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.8124785423279, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.7155222892761, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.070237159729, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.8988814353943, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.5513606071472, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6244759559631, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.0236768722534, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.5019235610962, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.53408002853394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.3302412033081, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.88192272186285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.9823932647705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.9681644439697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.3988819122314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.43344354629517, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.79311895370483, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.5998339653015, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.5876798629761, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.80431890487677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.8788814544678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.5124807357788, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.3894371986389, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.73055839538574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.4707221984863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.3148784637451, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.62432050704956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.1174383163453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.94544076919556, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.05744314193726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.1419253349304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.6916847229004, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.0177612304688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.5499229431152, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.2756838798523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.9065656661987, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.7987236976624, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.3843207359314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.281277179718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.3382396697998, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.5774435997009, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.9612793922424, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.6553583145142, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.6577596664429, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 552.1633577346802, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.129280090332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 723.0787229537964, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.3304018974304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.0911989212036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.7529635429382, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.34832239151, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.1279997825623, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.7484831809998, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.6841607093811, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 721.8694376945496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.0521626472473, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.4187164306641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 683.6155223846436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.5699162483215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.3118405342102, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.4272003173828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.6577596664429, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 720.1807975769043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.03855419158936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.89216136932373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 461.84208631515503, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.059841632843, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.6257586479187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.1444821357727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.69184160232544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.1319971084595, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.5099182128906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.04576206207275, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.6878423690796, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.067675113678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.4124798774719, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.1865644454956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.3537669181824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.47711992263794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.89551877975464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.04719400405884, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.06415700912476, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.33039712905884, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.64367628097534, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.8473539352417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.3446407318115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.8161563873291, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.1214408874511, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.36927366256714, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.65040016174316, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.42336320877075, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.2849626541138, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.8023986816406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.82015228271484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.87744092941284, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 466.6052794456482, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.22704553604126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.3409605026245, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.8120031356812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.60399770736694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.67215728759766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.9139189720154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.48047971725464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.4555196762085, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.9766402244568, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.34255933761597, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.4798374176025, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.7265558242798, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 
24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.6156826019287, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.45248222351074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.78463840484625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.96255922317505, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.70303201675415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.1379227638244, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.8209643363952, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.6668839454651, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.48415660858154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.90943670272827, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.9489541053772, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.59663677215576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.06143522262573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.68864011764526, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.21823501586914, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.38223695755005, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.2768020629883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.8963165283203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.07456398010254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.8452806472778, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.0979194641113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.28928136825556, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.133120059967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.8363146781921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.1732873916626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 503.9248013496399, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.8006434440613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.1726393699646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.2156763076782, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.7091193199158, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.4611191749573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.8033547401428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.7782368659973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.08592033386236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.2017631530762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.63040351867676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.1135993003845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.91632080078125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.9867219924927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.8675165176392, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.7299189567566, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.1996831893921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.69919776916504, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.27152395248413, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.8038401603699, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.63552141189575, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.6734414100647, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.7776007652283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.88143730163574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.33904218673706, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.5924777984619, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.23855781555176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.999520778656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.98703956604004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.47151803970337, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.3892774581909, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.0939211845398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.9155201911926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.56912183761597, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.78975439071655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.471200466156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.0088028907776, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.9056005477905, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.3862357139587, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.6009564399719, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.36463832855225, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.01903533935547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.49152135849, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.8369626998901, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.1068820953369, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.40688276290894, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.7883176803589, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.02847623825073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.8704047203064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.90239763259893, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.94143772125244, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.30656147003174, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.0460777282715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.05200147628784, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.87312173843384, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.0803198814392, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.1115174293518, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.60016107559204, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.1436758041382, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.8487935066223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.4116792678833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.3166465759278, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.97376585006714, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.86064529418945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 
8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.5206422805786, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.1236791610718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 847.0975923538208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.7028818130493, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 853.7065601348877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.457437992096, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 854.2985725402832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.5515203475952, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 862.2068881988525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.4817633628845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.8278388977051, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.0052766799927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.29439878463745, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.99135637283325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.1006426811218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.05664682388306, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.82144021987915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.1884789466858, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.77232217788696, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.1883158683777, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.51343965530396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.58128023147583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.4750409126282, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.49439811706543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.6110324859619, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 479.5860815048218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.14816093444824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.30335521698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.56463956832886, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.43007802963257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.52992391586304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.34608030319214, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.78256273269653, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.4327988624573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.7833614349365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.467041015625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.3667178153992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.6975932121277, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.4921598434448, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.59343671798706, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.8545589447021, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.39695978164673, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.0692811012268, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.6680030822754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.31728315353394, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.3214421272278, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.8332777023315, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.97855615615845, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.2771215438843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.5403218269348, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.5910358428955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.42703914642334, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.22720193862915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.9544024467468, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.580002784729, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.2416038513183, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.1864042282105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.44112205505377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.9291248321533, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.1135983467102, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.61759901046753, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.0438389778137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.7072019577026, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.9804797172547, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.4688024520874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.5473628044128, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.5652766227722, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.69215965270996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.3788814544678, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.264639377594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.4707236289978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.0300760269165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.7937636375427, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4462366104126, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.4332814216614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.5544023513794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 506.88447237014765, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.3735918998718, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.3686370849609, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.4799971580506, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.4604802131653, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.2886424064636, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.9265565872192, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.84047842025757, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.41616058349615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.1624002456665, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.0084772109985, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.6414394378662, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.529757976532, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.2883205413818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.0571188926697, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.7320022583008, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.75535821914673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.4038400650024, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.0808029174805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.3692841529847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.6886386871338, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.9918508529663, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.3352003097534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.9107098579407, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.41984033584595, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.1582398414612, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.6043210029602, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.99727630615234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7839941978455, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.3747200965881, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.9831981658936, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 783.0198454856873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 814.6391940116882, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.4806361198425, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 712.4558424949646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.9104022979736, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.9678425788879, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 780.861759185791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 806.8563151359558, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.1468782424927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.4620785713196, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.3959937095642, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.197114944458, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 787.6835203170776, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 806.4433646202087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.4460830688477, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 712.0347213745117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.1579179763794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.5918412208557, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 779.4144034385681, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.6856060028076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.6097555160522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.811520576477, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.6844806671143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.62672185897827, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.2084822654724, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.14895725250244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.05984258651733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.92447328567505, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.32592010498047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.6969618797302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.43584299087524, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.91135931015015, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.64559507369995, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} 
+{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.299204826355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.5310368537903, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.66847944259644, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.0955286026001, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.6545567512512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.204158782959, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.90400314331055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.0683250427246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.7543969154358, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.7822437286377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.02527952194214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.6473593711853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.94944429397583, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.0964789390564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.2278437614441, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.7108874320984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.2971200942993, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.6843204498291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.5854392051697, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.7180790901184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.2276773452759, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.8510413169861, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2492828369141, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.9841604232788, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.8430423736572, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
537.4569606781006, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.8008027076721, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.5592007637024, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.9675207138062, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.9864010810852, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9407949447632, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.0883193016052, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.7487988471985, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.992636680603, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.231999874115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.40223836898804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.6191935539246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.2566418647766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.6510338783264, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.30831384658813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.7609601020813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.3787207603455, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.8862390518188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.03696393966675, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.1632013320923, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.3855967521667, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.2307195663452, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.70000314712524, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.160481929779, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.5875129699707, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.5564832687378, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.75312185287476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.0271978378296, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.7043237686157, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.6507205963135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 731.5787196159363, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.5057654380798, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.6606373786926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.7057681083679, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.0148854255676, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.7852735519409, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.0175986289978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.038402557373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.7651171684265, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.5705647468567, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.8459167480469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.449761390686, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 731.1942386627197, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.9235215187073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.4508762359619, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.722243309021, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.77120304107666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.40847873687744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.56368112564087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.2615976333618, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.3188810348511, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 486.61120414733887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.4851245880127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.5836796760559, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.206392288208, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.9520010948181, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.75119638442993, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.2505569458008, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.92240715026855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.32512187957764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.55615615844727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.8715214729309, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.6545624732971, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.07456731796265, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.14319849014277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.6220769882202, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.56127405166626, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.78272008895874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.9639992713928, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.707200050354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.2280068397522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.41199588775635, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.3900828361511, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.04543685913086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.68640184402466, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.48640060424805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.52000093460083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.10048484802246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.7534456253052, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.2929553985596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0417609214782, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.243200302124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.2686376571655, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.285279750824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.08895874023443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.1867184638977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.8993611335754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1977596282959, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4747180938721, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
490.81679582595825, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.6769595146179, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9185547828674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9702391624451, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1036.7870473861694, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.5051140785217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1046.2390422821045, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.1932787895203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1046.6519975662231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.6742415428162, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1044.543514251709, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.4646401405334, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.842565536499, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.3147196769714, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.1494421958923, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.9870395660401, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.5614376068115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.566556930542, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.1870393753052, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.3379240036011, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.54752016067505, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3619117736816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.29087686538696, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.52160024642944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.10031938552856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.83615922927856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.50399875640875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2008056640625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.0345616340637, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.93583965301514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.6608033180237, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.1430411338806, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.7654371261596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.9604845046997, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.7700810432434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.4092745780945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.1110486984253, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.5150375366211, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.0920014381409, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.6753582954407, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.5708770751953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.36879730224604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.6083173751831, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.5787253379822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.9463958740234, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.89552116394043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.3484778404236, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.5164813995361, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.8527979850769, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.0395221710205, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.773115158081, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 642.9414367675781, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.9071979522705, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.21087598800665, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.3262405395508, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.1156787872314, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.7318396568298, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.7003231048584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.7531261444092, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.2833642959595, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.9310350418091, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.28383922576904, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.6716771125793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.9460773468018, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.1279988288879, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.6995258331299, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.0324811935425, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.5681557655334, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.7276787757874, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.55279636383057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.5289583206177, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.6558408737183, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.7048015594482, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.1683168411255, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.1206426620483, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.4311962127686, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.0340805053711, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.0395178794861, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.4912009239197, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.6211194992065, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.8147192001343, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.0406403541565, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.2803211212158, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.9638409614563, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.4401597976685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 735.8587193489075, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.3001618385315, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.1924767494202, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.8880033493042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.6904010772705, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.6627159118652, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 735.3420805931091, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.2684845924377, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.919517993927, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.5945582389832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.7884798049927, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.8612775802612, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 732.513120174408, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.4740858078003, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 707.9084801673889, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 909.161434173584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 929.0564870834351, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 918.5683155059814, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.5052795410156, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 920.7291269302368, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.9278326034546, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 918.1777667999268, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 937.9136037826538, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.9291195869446, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.4052758216858, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.24128055572515, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.25631999969477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.0483202934265, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.82959604263306, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7376008033752, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.1316819190979, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3908772468567, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.2968006134033, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.4148797988892, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.46095943450933, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.2516841888428, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.6539192199707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.5595216751099, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.4401602745056, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.4579219818115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.21792125701904, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.1166429519653, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1307225227356, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.6961603164673, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.3313570022583, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.8124785423279, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9503984451294, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.8972768783569, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.65936040878296, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.0369591712952, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.5923271179199, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.4888033866882, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.776801109314, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.0369563102722, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.0564775466919, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.1297578811646, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.0991978645325, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.4423990249634, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.1862421035767, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.2233581542969, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.0364861488342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.5759973526001, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.8592066764832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.3948802947998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.9006423950195, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.9043231010437, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.0724849700928, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 637.7334475517273, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 572.29407787323, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.2705583572388, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.9001603126526, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.31055784225464, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.080641746521, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.94592094421387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.3630361557007, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.5889620780945, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.53664445877075, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.438401222229, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.32239818573, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.243679523468, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.7697548866272, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.72816133499146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.63711881637573, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.7100830078125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.54943990707403, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.1915216445923, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.08943939208984, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.40735673904425, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.4511995315552, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.85279655456543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.65903949737543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.3120036125183, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.11631441116333, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.84496259689325, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.3870391845703, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.9044780731201, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.9284749031067, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.5910396575928, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.5049619674683, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.3652806282043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.4571189880371, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.682240486145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.1683211326599, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.362557888031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.2540826797486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.7601580619812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 511.8334436416626, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.5731191635132, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9656019210815, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.2577614784241, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.7577600479126, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 919.0977478027344, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 874.7552013397217, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 756.3060688972473, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.7388820648193, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 921.1959886550903, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 861.1860752105713, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.2121620178223, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 784.9721670150757, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 912.193922996521, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 881.9087982177734, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.1528024673462, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.8446373939514, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.306568145752, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 863.3812856674194, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.2852802276611, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.1214380264282, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 919.5694446563721, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 888.2473659515381, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.6396775245667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.0584011077881, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 919.3343925476074, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 862.6158475875854, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 769.6737670898438, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.6185617446899, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 916.5529584884644, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 879.6939182281494, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 761.3945579528809, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 771.7225646972656, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 926.3254356384277, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.9913539886475, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.3937511444092, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 788.101282119751, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1107.701120376587, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1057.808313369751, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1097.3603105545044, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1069.1529560089111, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1097.2671937942505, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1072.7227306365967, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1101.8201541900635, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1068.4824085235596, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.5830397605896, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.7286353111267, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.1070394515991, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.9907169342041, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.9278378486633, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.5656037330627, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.0745630264282, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.6241598129272, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.8638386726379, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.1241602897644, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.124321937561, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.6028747558594, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.0137577056885, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.2347211837769, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.9143986701965, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.2072019577026, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.6604838371277, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.4764742851257, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.7036843299866, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} 
+{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.5407962799072, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.6921577453613, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.0086326599121, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.0710425376892, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.3737664222717, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 861.4644765853882, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.940477848053, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 861.0625600814819, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.7523212432861, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 859.0662384033203, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 737.3638439178467, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 860.406084060669, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.2515215873718, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 
64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.1555190086365, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.5910339355469, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 684.4694375991821, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.677282333374, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 742.7108812332153, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.3068833351135, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.3060822486877, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.1395201683044, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.2188830375671, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.4937605857849, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 685.5468845367432, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.8824005126953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 753.9116764068604, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.9828796386719, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.3017597198486, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.6291246414185, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2439.718551635742, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.991039276123, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2439.4761657714844, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.2193632125854, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2454.920015335083, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.6356792449951, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2445.2486419677734, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.0816016197205, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1823.936471939087, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1275.231523513794, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1822.3835182189941, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1290.6222486495972, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1830.7166481018066, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1270.856966972351, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1848.1737613677979, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1281.9995260238647, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1836.9475078582764, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1274.169602394104, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1849.0423965454102, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1275.0219249725342, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1833.1099224090576, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1269.8171138763428, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1852.1881675720215, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1290.0630378723145, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1864.9940872192383, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.131685256958, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1870.2415943145752, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1160.5334424972534, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1865.4993438720703, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.476643562317, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1875.291519165039, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1149.147367477417, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7354.446449279785, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 950.5054426193237, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
7384.832649230957, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.2988796234131, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7363.726768493652, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.7424144744873, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7391.756782531738, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 941.5540885925293, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.32016372680664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.11040592193604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.0401611328125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.5196776390076, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.8401570320129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.84799814224243, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.2270412445068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.9315228462219, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.3454418182373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.49183893203735, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.6057605743408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.7560033798217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2574367523193, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.5294451713562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.8528022766113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.5372838973999, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.74928188323975, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.9041647911072, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.9552035331726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.1254391670227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} 
+{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.1179265975952, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.2208008766175, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.5699214935303, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2286367416382, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.62671661376953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.0147213935852, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.85567903518677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.6723184585571, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9484839439392, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.1278438568115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.3892812728882, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.188485622406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.2982397079468, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.54959869384766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.17919969558716, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.74176311492926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.5380735397339, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.51984167098993, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.8553576469421, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.9267230033875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.1755218505859, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.81024217605585, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.7123236656189, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.875039100647, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2884831428528, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
511.2454414367675, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.1627202033997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.4183979034424, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.1575999259949, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.1905603408814, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.01471614837646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9251136779785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6238441467285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.5215997695923, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.9964814186096, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.9159989356995, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.35424184799194, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.0009579658509, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.7408013343811, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.4070358276367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.9128007888794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.6710343360901, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.1092796325684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.8747172355652, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2273578643799, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.5648007392883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.8588833808899, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.5652747154236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.8296022415161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.0372776985168, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 723.475513458252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, 
"num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.2584013938904, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.0452828407288, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.3939199447632, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 707.0787215232849, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 721.7812728881836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.9521579742432, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.477276802063, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.2215991020203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.292317867279, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.0625629425049, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.0491199493408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.8100819587708, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 724.018075466156, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.4343996047974, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.538562297821, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.6363186836243, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.1576051712036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.7463998794556, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.1646418571472, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.2003178596497, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.731041431427, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.8990359306335, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.5300765037537, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 732.1078395843506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.646402835846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 514.4187211990356, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.0796785354614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.3464002609253, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.09152030944824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.38496589660645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.57263469696045, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.72128200531006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.10447835922247, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.5862398147583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.30191564559937, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.07008266448975, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.86767768859863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.28704071044916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.4027199745178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.2590432167053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.43952369689936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.1942372322083, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.5025601387024, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.2945647239685, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2728023529053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.3055996894836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.20784187316895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.1124801635742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.3400011062622, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.5700721740723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.28768253326416, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.0075206756592, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.4697632789612, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.1078481674194, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.4284815788269, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.7743968963623, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.79567813873297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.62351751327515, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.98047828674316, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.0241584777832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2131261825562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.8041577339172, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.40640163421637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.745762348175, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.9233565330505, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.83440113067627, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.34048080444336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.1209607124329, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.0296006202698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.07760620117193, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.8078370094299, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.1799969673157, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.5780844688416, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.21727657318115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.5143957138062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.6316738128662, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.9318385124207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.58543443679804, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.25919723510737, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2894358634949, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6953601837158, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.1419196128845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.1596798896789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.9310331344604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.1428799629211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.0884785652161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.76831674575806, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.9745635986328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.4521622657776, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 
5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.3798413276672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.4083228111267, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.2011179924011, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.2931213378906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.1755175590515, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.6528024673462, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.286075592041, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.6196784973145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.4230456352234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.793598651886, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.9734449386597, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.7540812492371, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.3628854751587, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.6414403915405, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.1048035621643, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.8438353538513, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.27327823638916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.58848381042475, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.82640409469604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.06848764419556, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9545564651489, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.38623762130743, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.4519987106323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.9772815704345, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.33359813690186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
500.16911983489985, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.24559783935547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.1257562637329, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.2673554420471, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.1748833656311, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.07024002075195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.60240077972406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.18624353408813, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.2841544151306, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.4100818634033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.8857564926147, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9220790863037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.7843179702759, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.0251159667969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1465630531311, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.9468765258789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.80992221832275, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.3422431945801, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.3614430427551, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.9355239868164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.67568159103394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.02335500717163, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.37167978286743, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.1481552124023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.0865626335144, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.0979218482971, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.99455881118774, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.10431814193726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.04416322708136, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.1798396110535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.5248045921326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2113628387451, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.9947166442871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.09519815444946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.31343841552734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.5028753280639, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.053918838501, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.8446383476257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.26303911209106, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.85887575149536, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.3568015098572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.66543912887573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.5147185325623, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.9619235992432, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.60015535354614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.1684837341309, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0785608291626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 874.3945598602295, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.772322177887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.2844886779785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.1121644973755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 874.3243217468262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.2979230880737, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 879.0352010726929, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.9257636070251, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.3798432350158, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.85663414001465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2291193008422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.6958384513855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.5795202255249, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.3785557746887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.8054389953613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.8051190376282, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.4955177307129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, 
"num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.29247665405273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.2675223350525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.938720703125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.7769618034363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.69119930267334, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9878416061402, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.5391993522644, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.86832189559937, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.4361548423767, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.030080318451, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.3260769844056, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.4603247642517, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.88207721710205, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.8081603050232, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.141122341156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.9615979194641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.59583663940435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.9275231361389, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.5115184783936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.274405002594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.66303873062134, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.7772827148438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.26784181594854, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.76816272735596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.67615461349493, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 505.55504560470575, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0500841140747, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.5083198547363, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.53760051727295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.6824059486389, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.8337631225586, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.7332787513733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.7766456604004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.809280872345, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.06032085418707, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.8980784416199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.5847959518433, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.1627125740051, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.6747250556946, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.2511959075928, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.84575748443604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.9225640296936, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.25856161117554, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.6576051712036, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4409627914429, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.0369558334351, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2860794067383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.2347197532654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.94143724441534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.21647977828974, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.85231637954706, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.7920064926147, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.40848159790045, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.9547171592712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.7339172363281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.1564769744873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.6196751594544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.6675143241882, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.7817568778992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.7488050460815, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7116847038269, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.1838459968567, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 663.4439992904663, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.2912030220032, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.82543420791626, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8752059936523, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 637.3879957199097, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.2328033447266, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.0777540206909, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.4710426330566, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.1694407463074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.8108806610107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1764750480652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.8641562461853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.2876825332642, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.4054408073425, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 523.2408022880554, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.1108884811401, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.8824014663696, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.7543940544128, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.4923195838928, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.731360912323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.9542369842529, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.3523230552673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.0388770103455, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.1299214363098, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.7744045257568, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.7681603431702, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.3956661224365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 841.4571237564087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.0318369865417, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.1212739944458, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 779.8830389976501, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.6016001701355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 818.8598299026489, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 839.2447996139526, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.7948808670044, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.3683214187622, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.7462377548218, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.1515188217163, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 814.9081563949585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 842.8244781494141, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, 
"num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.7712001800537, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 749.1935992240906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 775.7566404342651, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.9568033218384, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 823.5291147232056, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 846.8979120254517, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.6675186157227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.6992030143738, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.5499215126038, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.30191802978516, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.3907198905945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.8447952270508, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.89888191223145, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.44207763671875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.51328325271606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.1712055206299, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.742880821228, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.07775831222534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.4788784980774, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.0584034919739, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.96368169784546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.3220782279968, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.06127643585205, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.7980737686157, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4852824211121, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 492.5212788581848, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.2966365814209, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.9958391189575, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.95136404037476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.4340839385987, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.1915183067321, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.3401656150818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.1358428001404, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.3303999900818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.58319902420044, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.2697629928589, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.9320030212402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.13200187683105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.73648214340204, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.340163230896, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8992013931274, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.681282043457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.4419202804565, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.1244812011719, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.9852862358093, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.8313555717468, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.1179184913636, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.9057631492615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.0524706840515, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.4046406745911, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1921591758728, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 
4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.2057566642761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.1492824554443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.0143957138062, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.1697616577148, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.709755897522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.1302428245544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.1910357475281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.775363445282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.4731216430664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.3780832290649, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.4747142791748, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.5912051200867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.9903993606567, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.4051179885864, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.5265560150146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.5177640914917, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.7726430892944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.5479989051819, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.1587152481079, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.58608627319336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.2886366844177, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.584801197052, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.1860818862915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.9603180885315, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.6439986228943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 612.8236794471741, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.1798434257507, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.999837398529, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.2233567237854, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.9339208602905, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.9841628074646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.1315212249756, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.271044254303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.8460793495178, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.4313569068909, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.7401585578918, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.2086386680603, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.2854375839233, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.8731212615967, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.9388794898987, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.63968276977545, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.3755125999451, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.16239833831787, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.0152039527893, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.9955177307129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.3323221206665, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.8127965927124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.4606394767761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.35039854049677, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.2704014778137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.3905658721923, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.89888525009155, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.7204756736755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2705669403076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.3353614807129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.965922832489, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.09680032730097, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2867250442505, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.04431581497187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.80560302734375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.3481631278992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.3953580856323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.5215964317322, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.0185613632202, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0947160720826, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2012805938721, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.8695993423462, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.7214369773865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.368001461029, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.8206462860107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.92592048645025, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.1172814369202, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6531195640564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.2099194526672, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.4550418853759, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.7300815582275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 526.2699198722839, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2220783233643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.82479381561274, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.8185601234436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.5329594612122, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.9403228759766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.7735958099365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.0487985610962, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2185597419739, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.3185596466064, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1062.3534297943115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.0342388153076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1062.4356842041016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} 
+{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.0468873977661, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1064.6155261993408, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.8247957229614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1069.221749305725, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.9679985046387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.5446405410766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.2796778678894, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.528639793396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.5260772705078, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.5854444503784, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.81903886795044, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.77504158020014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.271999835968, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.0537638664246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.637282371521, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.4011220932007, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.0612750053406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.4062356948853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.1631984710693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.0724830627441, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.7675175666809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.7327995300293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.287043094635, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.4600005149841, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.4291191101074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
521.7305564880371, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.2435231208801, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.0336012840271, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.5094399452209, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.7393579483032, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.4918365478516, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.7483253479004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2278351783752, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.4897546768188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.170557975769, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.619366645813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.4067230224609, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.6619181632996, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.1164793968201, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.966881275177, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.1822395324707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.7065539360046, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.0073504447937, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.9081635475159, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.2500791549683, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.5897583961487, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.0678377151489, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.0827221870422, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.2992005348206, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.1617631912231, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.2819166183472, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, 
"num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.3295965194702, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 681.1264038085938, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.069926738739, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7318429946899, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8723258972168, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.9729585647583, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.588321685791, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.7486357688904, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.0950417518616, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.8510394096375, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.8887991905212, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.6012873649597, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.9747171401978, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 
32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.6312017440796, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.4865670204163, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.2233624458313, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.2689609527588, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.3409585952759, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.3220806121826, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.7095928192139, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.0620818138123, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.854875087738, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.9275240898132, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 732.8964757919312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.2609572410583, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
710.1161527633667, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.1668815612793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.7356848716736, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.5388803482056, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 737.3942399024963, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.5537557601929, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 707.7444744110107, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.5902457237244, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.7891201972961, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 748.4006404876709, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 733.0875182151794, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.783362865448, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 706.7100787162781, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.2998385429382, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.5044736862183, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.2467193603516, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.5801577568054, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 961.1363124847412, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 976.3267183303833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.2817678451538, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 979.0727949142456, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 963.3452892303467, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 977.4545621871948, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 963.6254358291626, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 979.1567897796631, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.0110397338867, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.50240087509155, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.357916355133, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.6998448371887, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.6371216773987, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.5580787658691, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.1281642913818, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.666081905365, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.3694443702698, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.26863908767706, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.442883014679, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.0769553184509, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.617434501648, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.4894433021545, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.6171169281006, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.7303972244263, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7929615974426, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.7809543609619, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.3731241226196, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.0220808982849, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.2668871879578, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.3467154502869, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.1715245246887, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.7403240203857, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7563166618347, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.7993555068969, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 531.1385631561279, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.3073592185974, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.0360064506531, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.1366353034973, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.89808177948, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.785126209259, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.6876773834229, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.4948830604553, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 722.4427199363708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.443835735321, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.7939209938049, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.6385588645935, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.3308773040771, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.5303983688354, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.4428768157959, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.7195200920105, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 727.3657631874084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.9657597541809, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.6217575073242, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.1848068237305, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 727.3036789894104, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.1179213523865, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.55231761932373, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.3280005455017, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.1881637573242, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.5577549934387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, 
"num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.58288097381586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.5655951499939, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.5694375038147, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7092761993408, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.1460728645325, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.7044768333435, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4916839599609, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.3792014122009, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.02255582809454, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.0334372520447, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.7384028434754, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.88031673431396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.9892807006836, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.2201590538025, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.9388790130615, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.812961101532, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.6601629257203, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.0335946083069, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.3079957962036, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.905601978302, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.0860795974731, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.2968029975891, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.0737609863281, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.2716827392578, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.4595184326172, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 575.4486441612244, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.6582417488098, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.4278411865234, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.4052748680115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.5606355667114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.9928016662598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.3808054924011, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.4820823669434, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.5819201469421, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.3048024177551, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.6622395515442, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.4428758621216, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 921.077766418457, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 
48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 799.976315498352, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.2801599502563, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.2292804718018, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 899.2993545532227, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 790.6057572364807, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.8323135375977, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.9473638534546, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 916.7160034179688, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.7960052490234, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 813.1622362136841, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 938.4372854232788, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 905.6481552124023, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.6884803771973, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.5590372085571, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.9599952697754, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 921.845760345459, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 792.2380781173706, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 806.2897539138794, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 934.5491170883179, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 911.2388849258423, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 802.2751998901367, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 821.4224004745483, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 933.939037322998, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.9467124938965, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 804.8980808258057, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.1740884780884, 
"config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.5601577758789, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 905.476803779602, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.7479944229126, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 821.7107224464417, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1155.146722793579, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1113.1217670440674, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1161.7456102371216, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1131.181116104126, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1160.6601572036743, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1122.8836727142334, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.5030460357666, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1122.039680480957, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.939359664917, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.340479850769, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.647843837738, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.4166374206543, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.0840015411377, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.9683218002319, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.1275210380554, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.7796859741211, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.9814434051514, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.3476805686951, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.1798391342163, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.1336011886597, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.9857606887817, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} 
+{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.5377550125122, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.1996831893921, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.6595230102539, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.9598388671875, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.800802230835, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.5363230705261, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.1683177947998, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.794882774353, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.5908823013306, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.1895971298218, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.7640008926392, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 878.3763217926025, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.5832018852234, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 878.9993667602539, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.490394115448, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 871.176962852478, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.7972755432129, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 881.5416049957275, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.1699228286743, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.9774374961853, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 605.4067182540894, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 699.2614388465881, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.9470381736755, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 766.6521668434143, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.412965297699, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 697.3172760009766, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.633599281311, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.1723170280457, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.7616000175476, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 697.5601625442505, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.8230409622192, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 773.2115173339844, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 605.4091215133667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 699.9127984046936, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.9006400108337, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2502.188491821289, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.8932809829712, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2512.182397842407, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.8836770057678, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2522.870569229126, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.6814341545105, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2532.1879863739014, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.7032008171082, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1920.2425479888916, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1325.732479095459, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1939.4275283813477, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1349.6481609344482, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1926.4311695098877, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1330.7275199890137, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1933.9334392547607, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1356.2022304534912, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1929.1007709503174, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1340.8911895751953, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1943.335371017456, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1348.0604839324951, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1934.2028903961182, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1340.9367942810059, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1940.1208019256592, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1357.8107213974, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1880.2817630767822, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1173.8462400436401, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1879.7217464447021, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1180.1729536056519, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1884.256992340088, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1178.47008228302, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1885.5079936981201, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1177.8739166259766, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7463.56143951416, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 968.3332777023315, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7466.946144104004, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 971.491208076477, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7471.216354370117, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.8998441696167, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7484.053115844727, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 973.820629119873, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.0603189468383, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.5432019233704, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7871990203857, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.6755204200745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.4220843315125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.0828757286072, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.5691194534302, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.6315202713013, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.6953611373901, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3081579208374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.6657557487488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.8080010414124, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.2745566368103, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.2753643989563, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, 
"num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.5737566947937, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.0305614471436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.3521590232849, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.1528029441833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.3487992286682, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.6441602706909, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.7936010360718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.6875176429749, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.4356846809387, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.2316741943359, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.8732786178589, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.5558385848999, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.8894367218018, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 
32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.37440204620367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.27423620224, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.6806406974792, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.393274307251, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.640962600708, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.5964832305908, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1604804992676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.3163232803345, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.0208044052124, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.7168035507202, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.5582423210144, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.8864040374756, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
605.0966382026672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.8177614212036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1848006248474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7328038215637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.9067215919495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.9183993339539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.1769638061523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.9708814620972, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.9177570343018, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.1201586723327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.42671489715576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.002076625824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.7334394454956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.0566415786743, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.1297564506531, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.8737630844116, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.774561882019, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.2361621856689, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.37807846069336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.5304002761841, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.5033626556396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.346079826355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.6851277351379, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.3148803710938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.1604771614075, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.3432030677795, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.5134401321411, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 798.462085723877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 818.9644742012024, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.9464030265808, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.3315148353577, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 846.0407876968384, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.7017545700073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.519838809967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.0739245414734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.843514919281, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 821.9647979736328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.651035785675, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.9764790534973, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.2883253097534, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 869.1459131240845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.9611196517944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.6940813064575, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.0358452796936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 818.5174417495728, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.4014358520508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.1568007469177, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 840.9540748596191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 864.4908666610718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.457597732544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.672164440155, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 799.4681644439697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 812.3716855049133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 651.0638403892517, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.0976028442383, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 849.3897533416748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 863.1347179412842, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.1681609153748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.0161519050598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.26015901565546, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.41263484954834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.2406373023987, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.15791511535645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.2884788513184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.38400077819824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.9025583267212, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.8307199478149, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.51039934158325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.6041617393494, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.0499234199524, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.53055620193476, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.7684774398804, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.0728068351745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.6985578536987, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1443204879761, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.5921635627747, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.7665586471558, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.1644740104675, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.7950429916381, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.9515175819397, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.8393659591675, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.1383996009827, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.1119995117188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.6364817619323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.8902368545533, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.31135797500616, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.18607902526855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.68384075164795, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.67280292510986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.08863830566406, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.1692805290222, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.8734407424927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.3118405342102, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.4140782356262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.8049626350403, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.9657578468323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.3254432678223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.73616027832037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.7931156158447, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.1039929389954, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.989116191864, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.85328197479254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 512.4505615234375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.7296023368835, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.8580803871155, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.11791610717773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1356792449951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.4089622497559, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.618236541748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.25583887100225, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.5779175758361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.7707147598267, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.7715172767639, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.07519769668573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.34672498703003, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.9995188713074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.6859097480774, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.08863830566406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.1902441978454, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.9132761955261, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.1409616470337, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.4143967628479, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.8926396369934, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.3665609359741, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.7492804527283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.8492832183838, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.8615989685059, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.4248023033142, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.3923192024231, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.2492814064026, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.6583905220032, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.0414423942566, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.7633609771729, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.3524785041809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.2523250579834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.0996813774109, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.5651183128357, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.48128175735474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2774386405945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0020785331726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.1777606010437, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.571834564209, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.6465578079224, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.7819232940673, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.70928049087524, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.5284829139709, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.9577641487122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.9107189178467, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.6623873710632, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.5209593772888, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.2940773963928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.650399684906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.7259216308594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 506.1737585067749, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.3169584274292, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.0273613929749, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.1987223625183, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8983969688416, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2604737281799, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.61439895629877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.5743975639343, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.98480129241943, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.410722732544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.1321592330933, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.7683200836182, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.7859172821045, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 
96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2052798271179, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.9408025741578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.12415552139277, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.6259245872498, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.5412817001343, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.5956816673279, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.821605682373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.3931188583374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.1377568244934, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2900881767273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.1286382675171, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.0137605667114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.1913628578186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.2851228713989, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.5982403755188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6113533973694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.2795233726501, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.7131214141846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.86400222778315, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.9575972557068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.0246353149414, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.5750379562378, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.9230403900146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.5692782402039, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.7980813980103, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 526.8292784690857, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.8326387405396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 911.382246017456, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.5283236503601, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 916.3305568695068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.5051231384277, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 911.9937562942505, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.3777542114258, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 917.0539140701294, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.0640029907227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.8227243423462, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3945631980896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.1113624572754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.2366423606872, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.46063899993896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.7503967285156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4950385093689, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.5180807113647, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.6387186050415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.33759784698486, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.8115224838257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.2659134864807, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.0312042236328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.320963382721, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.0151944160461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.80751609802246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.02127790451044, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.5423974990845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.9673624038696, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.2340755462646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.0611200332642, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.78319644927984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.26656055450434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.9806394577027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.4031987190247, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.402717590332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.9006357192993, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.5092749595642, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.2947187423706, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6726412773132, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.2830410003662, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.7892818450928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.29776811599737, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.2628793716431, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.9375996589661, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7718396186829, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.5196785926819, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.3888006210327, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2310423851013, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.896800994873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9084792137146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 514.3180751800537, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.5859222412109, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.4838376045227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.6188836097717, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.336323261261, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.0769605636597, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.9233565330505, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.6889595985412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8731160163879, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.733606338501, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.0943970680237, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.3582458496094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.047679901123, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.0582389831543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.2377610206604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.9881567955017, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.3598389625549, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.9878396987915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.3865571022034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.5782384872437, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2952008247375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.5272002220154, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.2681570053101, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.0187201499939, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.5612816810608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.3899164199829, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 
4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.1167988777161, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.3593559265137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.8574390411377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.5643219947815, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 681.8384051322937, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.9324798583984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.8036775588989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.9542379379272, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.4033517837524, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.6160001754761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.5244822502136, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.4601588249207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.5915231704712, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.3617601394653, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2588787078857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.1147179603577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.4596800804138, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.3079977035522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.7511978149414, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.2209610939026, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.9425644874573, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.1087989807129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.9316835403442, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.9375991821289, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 663.3572793006897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 564.4929623603821, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.7884798049927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.6187205314636, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.3569579124451, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.7187194824219, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 841.4267301559448, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 856.6420888900757, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.2080006599426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 767.4051213264465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.3054423332214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.6800031661987, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 842.3431968688965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 855.0816011428833, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.937593460083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 766.4742398262024, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 795.1915144920349, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.3919949531555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 846.965274810791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 867.673282623291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.7649598121643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 764.3028783798218, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 797.3545622825623, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.5052742958069, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 848.290228843689, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 857.3958396911621, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.0340809822083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 766.4139151573181, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 795.2583932876587, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.8270406723022, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.01232099533075, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.79759502410894, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.6967992782593, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.68416070938105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.111361503601, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.3140811920166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.7512021064758, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.4587211608886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.7281560897827, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.1086401939392, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.0422358512878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.16959285736084, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.19583940505987, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.6660771369934, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.3158383369446, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.3355212211609, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.99631547927856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.6396827697754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.3582434654236, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.6889629364013, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.9361605644226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.14399766922, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 529.9347257614136, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9889621734619, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.6281623840332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.8576021194458, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.6638445854187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.976957321167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.1071991920472, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.9208083152771, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.4289622306824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.3260803222656, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.1131181716919, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.492965221405, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.710563659668, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.795521736145, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.7111916542053, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.9142355918884, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.6327929496765, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.7796816825867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.4331192970276, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.4910373687744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.3044838905334, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.5264058113098, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.513918876648, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.952793598175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.9996786117554, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.3222427368164, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, 
"num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9145617485046, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.5529561042786, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.8420767784119, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.1195178031921, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.7169585227966, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.4046363830566, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.990243434906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.8153614997864, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.3097653388977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.0239996910095, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.7958421707153, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.9412751197815, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2214398384094, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.3345670700073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.67520570755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.9497632980347, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 760.5332803726196, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.3624029159546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.7915177345276, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.4875197410583, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.448320388794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.3750386238098, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.7713632583618, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.5176048278809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.4043173789978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 674.0036773681641, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.0135998725891, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.2551989555359, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.0670385360718, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.7351975440979, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.5424003601074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.7137637138367, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.2350430488587, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.312798500061, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.0489640235901, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.817120552063, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.7508826255799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.2951946258545, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.4185585975647, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.182719707489, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.45024108886713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.4958367347717, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.097442150116, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.6062393188477, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.6286401748657, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.4654388427734, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.8657531738281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.1257605552673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.3217673301696, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2273635864258, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.8561539649963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.3079986572266, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.5998420715332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.0371170043945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.5483183860779, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.5024003982544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.47920083999634, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.4395189285278, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.3895998001099, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6382393836975, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.0168023109436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.5456047058105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.6683201789856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
513.9942359924316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.9411172866821, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.6548810005188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.1296005249023, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.1164746284485, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.661600112915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.5753617286682, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.6084814071655, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3305597305298, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.0164804458618, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.7988777160645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.0115189552307, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.0311970710754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.8952045440674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.0660743713379, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.7788844108582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1071.846718788147, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.9227204322815, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1073.27054977417, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.7475185394287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1077.9750394821167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.9838395118713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1082.78799533844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.2513628005981, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.9088034629822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.5451216697693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.7721562385559, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.4798412322998, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.4169611930847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.4590377807617, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.9686427116394, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.2668771743774, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.9203171730042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.9015970230103, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9767956733704, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.41599893569946, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.9601616859436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.8236804008484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.4391965866089, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.3846373558044, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.4899158477784, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.2964816093445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.1459202766418, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.5657544136047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.7374496459961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.9068813323975, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.1609625816346, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.8931188583374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.3432011604309, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.2140784263611, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.2542414665222, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.3694443702698, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.9828805923462, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.747838973999, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.6524796485901, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.8112020492554, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.0780849456787, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.8292856216431, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.2932806015015, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.5326428413391, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.5865559577942, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.7563209533691, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.31951379776, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.3539175987244, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} 
+{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.2407975196838, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.070234298706, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.3153643608093, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.4620747566223, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.9492840766907, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.7636799812317, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.2308721542358, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.5460758209229, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.0940809249878, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.8001565933228, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.2923202514648, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.1092820167542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 605.316162109375, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.8847994804382, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.1929640769958, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.6766428947449, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.2127981185913, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9396743774414, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.9163198471069, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.7244849205017, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.1000027656555, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.6432046890259, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.3168048858643, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 697.8276777267456, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.5575909614563, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 736.1916756629944, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.3156781196594, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.1681618690491, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 760.9014439582825, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.5169653892517, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.6296038627625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 728.6849594116211, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.6320023536682, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.751838684082, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.7249598503113, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.1246409416199, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.2083191871643, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 733.6545634269714, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 694.7238349914551, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.9833588600159, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 766.46000623703, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.9748768806458, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.6273593902588, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 736.2687969207764, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.9307174682617, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.6622433662415, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 771.6876792907715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.9667205810547, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 985.7188749313354, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1001.572003364563, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 986.2617588043213, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 
96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1001.8129587173461, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 983.9632034301758, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1008.9296007156372, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 982.8913688659668, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1008.8852834701538, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.2398409843445, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.7081623077393, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.8484811782837, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.5873599052429, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.8672013282776, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.153920173645, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.7104015350342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.2427201271057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.4833607673645, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2651166915894, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.2048015594482, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.9199995994568, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.8700737953186, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.6121587753296, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.2052764892578, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.724956035614, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.2187190055847, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2000026702881, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.2348818778992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.7931289672852, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.1852769851685, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.7321586608887, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.602560043335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.2657580375671, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.541437625885, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.257764339447, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.159679889679, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.5339164733887, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.5971202850342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.7740797996521, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.2430362701416, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.6649675369263, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.5172824859619, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 601.4684772491455, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 742.6668810844421, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.143358707428, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.9372806549072, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.7983999252319, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.4020833969116, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.2817621231079, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.4838371276855, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.606876373291, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.8448038101196, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.4238381385803, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.5772738456726, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.9470405578613, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.1017565727234, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.5865573883057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.616641998291, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.2857613563538, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.0148758888245, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.3879985809326, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.8740773200989, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.887363910675, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.52383756637573, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.3430366516113, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.1487998962402, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.8084759712219, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.0822372436523, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.7280020713806, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.91200256347656, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7006411552429, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.2462372779846, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.051203250885, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.9691200256348, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6312012672424, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.7724823951721, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.3990454673767, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.6767983436584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.2070407867432, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.5502390861511, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.9102444648743, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.374400138855, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.2923183441162, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.8424010276794, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.711042881012, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.529757976532, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.2929558753967, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 605.2484798431396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.4345560073853, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.8401598930359, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.5531277656555, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.2745552062988, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.687361240387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 601.0030460357666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.8001623153687, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.0979166030884, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.6596803665161, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 938.8708782196045, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 933.3428907394409, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 806.6062498092651, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.8475284576416, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.0080099105835, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 920.7534551620483, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.7260751724243, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 826.9105577468872, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.6179246902466, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 944.681601524353, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 816.4420700073242, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 833.6480093002319, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.1675186157227, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 926.2828874588013, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 812.500467300415, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 829.5297622680664, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 943.0897617340088, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 946.083517074585, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 818.0067253112793, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 836.5208101272583, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 940.1592016220093, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 928.2118368148804, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.1395235061646, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 823.1782293319702, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 941.5383958816528, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 949.1524744033813, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 812.5603246688843, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.959997177124, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.2051267623901, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.173749923706, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 809.9843168258667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 828.2486486434937, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1178.4764766693115, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1130.32320022583, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1186.4342308044434, 
"config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1136.905426979065, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1182.4462461471558, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1138.4574460983276, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1184.6959972381592, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1137.0959901809692, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.3907208442688, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.6041626930237, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.4972853660583, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.6107177734375, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.5347218513489, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.3566408157349, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 686.3699221611023, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.3311982154846, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.8641610145569, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.5596785545349, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.007040977478, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.1008014678955, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.029914855957, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.5635209083557, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.5428824424744, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.0348806381226, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.082558631897, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.2852754592896, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 686.1870408058167, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.2113585472107, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} 
+{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.5435152053833, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 668.8268756866455, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.6777606010437, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.5371189117432, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 883.8755130767822, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 764.1984009742737, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 895.3737640380859, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 767.0913600921631, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 887.8481578826904, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.7481570243835, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 894.3195199966431, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.0977635383606, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.8739252090454, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 
128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.1700768470764, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 707.6553606987, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.7859153747559, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 780.9352016448975, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.0686411857605, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.4377593994141, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.7073659896851, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 783.101761341095, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.9715209007263, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.5652809143066, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.4166445732117, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 790.0716853141785, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 628.2886385917664, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 712.1583938598633, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.3241620063782, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2521.7492961883545, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.7732825279236, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2530.7081508636475, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.0651245117188, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2544.6868801116943, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.168155670166, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2553.4073638916016, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.295684337616, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1943.988962173462, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1345.328164100647, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1949.3907451629639, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1361.799349784851, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1949.2251110076904, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1346.7343950271606, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1955.9214305877686, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1368.5054445266724, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1952.2484683990479, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1354.713110923767, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1957.517900466919, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1372.2995233535767, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1952.1230792999268, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1352.1305513381958, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1958.2923316955566, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1372.8006410598755, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1865.508975982666, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1185.5326461791992, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1865.8688163757324, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1190.4919862747192, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1867.5232028961182, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1188.845591545105, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1871.9436836242676, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1190.1843166351318, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7485.676460266113, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 976.5776109695435, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7486.932640075684, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
971.8083143234253, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7492.872543334961, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 970.3041553497314, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7513.126907348633, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 977.3367929458618, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.0385522842407, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.8475217819214, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 726.2756776809692, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.4318375587463, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1097.4406433105469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.076476097107, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1042.0361518859863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1065.3776025772095, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.7980847358704, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.800323009491, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.2513580322266, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.0417566299438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1086.9603252410889, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1050.87824344635, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1068.0244779586792, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1090.270881652832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.4615964889526, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.2632012367249, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 716.8827199935913, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.6560020446777, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1094.0374422073364, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1058.231987953186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1060.8553504943848, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1104.711675643921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 689.6995186805725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 694.4683265686035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.254876613617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.7734365463257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1090.0416040420532, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1057.1808004379272, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1068.3619260787964, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1093.8209676742554, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 732.2345566749573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 759.2289614677429, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 838.7801551818848, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 971.5772771835327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1048.3521604537964, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1026.6931247711182, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1091.0323190689087, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1234.908151626587, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.1403198242188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.4292802810669, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 837.9812812805176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 974.6336030960083, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1048.88032913208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.3462371826172, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1093.2025575637817, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1229.9054336547852, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 721.9833564758301, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.1385598182678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 835.3201770782471, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 973.6555194854736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1046.0542345046997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.1334390640259, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1088.440146446228, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1228.6219310760498, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.4796800613403, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.0143985748291, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 836.3923168182373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, 
"num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 965.1556777954102, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1049.1747093200684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1026.3545608520508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1084.7732782363892, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1237.6918363571167, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 992.0294380187988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1112.4440050125122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1639.2615985870361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1693.176622390747, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1284.312801361084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1281.3150358200073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1837.6454257965088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1877.070083618164, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1000.7039928436279, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1105.8030366897583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1652.9142379760742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1679.1923236846924, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1287.8228902816772, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1284.1814374923706, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1829.1524696350098, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1878.6201763153076, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1002.0670366287233, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1107.559208869934, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1646.8219089508057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1691.1127853393555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1302.864327430725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1271.6780757904053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1829.8835182189941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1877.126865386963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1010.4499101638793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1111.6452836990356, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1626.6140604019165, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1701.1876964569092, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1311.0716772079468, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1289.9491214752197, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1828.4331321716309, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1878.756332397461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.7854385375977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.6414394378662, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.7289657592773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 723.1070423126221, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 698.9587235450745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.6572847366333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.9707188606262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 726.4900779724121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.7484803199768, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.5270400047302, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.261438369751, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.9302401542664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.4255986213684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 668.6576008796692, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 683.5809588432312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.5174403190613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.8935985565186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.2814407348633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.9567999839783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 722.4324798583984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.4417643547058, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.6425614356995, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 683.450882434845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.0975980758667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.4260764122009, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.4814395904541, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 664.8704028129578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.3076801300049, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.0713586807251, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.9750428199768, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.8915209770203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 733.9763164520264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.6404790878296, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.9841585159302, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 966.28737449646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 973.398232460022, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.9844861030579, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.8084750175476, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 988.4087991714478, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 989.3472099304199, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.8305583000183, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.2603168487549, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.8867177963257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 969.8819208145142, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.8454384803772, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.9564805030823, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 962.2441625595093, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1000.7641649246215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.2964754104614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.5947179794312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 935.3392028808594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 962.4923229217529, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 718.6081647872925, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 748.2320046424866, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 962.2454452514648, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1006.4216041564941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.0060791969299, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.2631993293762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 923.5367965698242, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 965.030722618103, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.9232001304626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.3897614479065, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 961.1070442199707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 990.4635190963745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
976.5908813476562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1307.876968383789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 854.9668741226196, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1143.5702323913574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.4764785766602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1294.7480010986328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 858.624005317688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1150.023045539856, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.3345623016357, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1303.1838464736938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 857.4694442749023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1149.169921875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 959.4745588302612, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1308.8129568099976, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 848.7980842590332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1142.75887966156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.397759437561, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.963041305542, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.9703960418701, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.7083191871643, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.1889634132385, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.5752000808716, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.2022399902344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 728.457441329956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.4924812316895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.1862373352051, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.4017615318298, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.4884757995605, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.5337543487549, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.0561580657959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.9014372825623, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.5244817733765, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.3443269729614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.3439984321594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.4753613471985, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.5897603034973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.072639465332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.4684782028198, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.6260833740234, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.2465620040894, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.7089614868164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.0382509231567, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.1743969917297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 737.7763223648071, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.905125617981, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.6401629447937, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.8627166748047, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 731.5620803833008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.9873633384705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 861.725435256958, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 869.6388721466064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.3446378707886, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 786.9830417633057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 794.2708778381348, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.727518081665, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 837.8744029998779, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 824.7273540496826, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.3700828552246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 771.9555163383484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 768.9867234230042, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.4281620979309, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 836.9027185440063, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 827.0968055725098, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 698.0323195457458, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 771.158881187439, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.3134398460388, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.259684085846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 827.6155185699463, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.9415969848633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 694.2118453979492, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 769.3417620658875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 768.8142418861389, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1696.726245880127, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1095.3347253799438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1690.3497505187988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1012.583203315735, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1687.821445465088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1016.8332862854004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1680.4811096191406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1016.026406288147, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.8689570426941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 718.7489604949951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 728.7094402313232, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.6454372406006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.6563220024109, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.649760723114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.165602684021, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.4902367591858, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 706.8849611282349, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 676.0153603553772, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.7264060974121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.930558681488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 694.1463971138, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.0579223632812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.0336036682129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.7243223190308, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.3604731559753, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.7030358314514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.768479347229, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.6012787818909, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.1963195800781, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.5147228240967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.9017601013184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.2670373916626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 908.1606340408325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 856.5286493301392, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 879.2572736740112, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.7097587585449, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 881.1470413208008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 816.742889881134, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 877.0390462875366, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 809.639356136322, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.5998406410217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.9131183624268, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
625.7036828994751, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.5089664459229, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 802.1657562255859, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 722.7801632881165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 734.0424060821533, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 761.6350388526917, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.8772802352905, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.1888031959534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.6971211433411, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.5595231056213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 783.0963206291199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 712.9271984100342, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 732.0092821121216, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.9528021812439, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.4572839736938, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.0369625091553, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.7460803985596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.8643155097961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.1459245681763, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.2678399085999, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 731.0860800743103, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 758.3113598823547, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.0057597160339, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.3947229385376, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.6780800819397, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.7806429862976, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, 
"num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 782.7446389198303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.9985599517822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.7785639762878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.8047943115234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.6348867416382, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.4726419448853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 736.4572787284851, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1042.372169494629, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 782.5702381134033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 728.8771224021912, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 830.9343957901001, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1109.3494319915771, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.0964789390564, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.7124767303467, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 735.6393599510193, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1027.21200466156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 782.3151993751526, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 735.1779174804688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.4380836486816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1104.9745559692383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.8883213996887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 651.8678402900696, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.7575988769531, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1022.3705530166627, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 788.6702418327332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 733.3361625671387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 802.0587277412415, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1118.440637588501, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.4745554924011, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 651.5785574913025, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.1488018035889, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1038.227367401123, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 781.5470385551453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.268159866333, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 806.988480091095, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1101.4806365966797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 950.0555229187012, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1310.5729627609253, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} 
+{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1369.9715328216553, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 958.7740802764893, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1227.5124883651733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1264.5772743225098, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 954.484486579895, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1321.5358352661133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1373.0083227157593, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.434720993042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.0912103652954, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1272.1817636489868, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 940.1297616958618, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1315.9067153930664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1377.9009437561035, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 954.7270393371582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1204.5113611221313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1271.9702434539795, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.7540836334229, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1308.7649631500244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1371.9257545471191, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 949.5920038223267, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1208.7875175476074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1279.6580696105957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.9225625991821, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.5030355453491, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.1164793968201, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 606.9249606132507, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.0934371948242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.1435217857361, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.5502362251282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.4315180778503, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.2571177482605, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.6041569709778, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.6470394134521, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.0974354743958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.4431972503662, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.4862418174744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.6593637466431, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.4078392982483, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.9287972450256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.9838366508484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.5484828948975, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.9659194946289, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.932954788208, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.4839997291565, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.2364768981934, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.5532846450806, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.3433589935303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.0244832038879, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.6204776763916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.126874923706, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.3841619491577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.6316757202148, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.3606395721436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.306236743927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.8017630577087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.1667213439941, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 775.5905604362488, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.4523162841797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.2667217254639, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.483362197876, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.90895652771, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 783.9528012275696, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.753598690033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.25727891922, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.4220838546753, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 788.037919998169, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.8118419647217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.4139165878296, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 761.2171149253845, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 785.7353639602661, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.7551975250244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.2155237197876, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.6702446937561, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.327362537384, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.3673620223999, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.6428823471069, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.5719995498657, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.8832001686096, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.5694408416748, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.6315197944641, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 758.9847946166992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.5702438354492, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 685.7436776161194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.151674747467, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.9441566467285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.57967710495, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.0300731658936, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1167.2147226333618, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1013.7425661087036, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, 
"num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 902.6828861236572, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.4001607894897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1180.5481576919556, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1018.2075262069703, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 901.6348791122437, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.3236880302429, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.938554763794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1022.9980802536012, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 896.7939186096191, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 809.9787211418152, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1171.892008781433, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1028.0190324783325, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 893.1148767471313, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.4886426925659, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.6440000534058, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.2859253883362, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.1676826477051, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.7723145484924, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.1627197265625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.645441532135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.4489598274231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.156955242157, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.5331196784973, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.1622338294983, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.8876781463623, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 622.1713590621948, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.9614410400391, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.0238375663757, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.7977585792542, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.2014403343201, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.9959983825684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.0420851707458, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.8334455490112, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.3577599525452, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.0859212875366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.3745636940002, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.1040000915527, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.1937599182129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} 
+{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.3727989196777, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.7201552391052, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.9391975402832, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.7889609336853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.3163194656372, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.1294393539429, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.8649578094482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.2844815254211, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.6995224952698, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.5838370323181, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.9513621330261, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.3766403198242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.0257635116577, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 676.814079284668, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.0062355995178, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.0303955078125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 685.895037651062, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.6356806755066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.6977553367615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.1334414482117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 682.0091199874878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.6371192932129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.4497594833374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1468.002233505249, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.894889831543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 1499.080638885498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 821.6987228393555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1503.625283241272, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.1233558654785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1509.8982429504395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 816.5620756149292, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.7532801628113, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.4809622764587, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.292640209198, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.4980754852295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.4260830879211, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.1028738021851, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.7606468200684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} 
+{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.2907228469849, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.5968012809753, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.5480027198792, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.0720009803772, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.5219230651855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.4859156608582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.3099131584167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.9305653572083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.4486413002014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.1262426376343, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.6668844223022, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.0465569496155, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.9127988815308, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.4727964401245, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.6990361213684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.1230397224426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.3364825248718, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 923.6857509613037, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.2328023910522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 919.8958349227905, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.1665639877319, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 926.2558460235596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.5401582717896, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 928.5524845123291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.236319065094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 609.6393632888794, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.5728001594543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.1913542747498, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.3193593025208, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.6684856414795, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.3065605163574, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.0428791046143, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 724.2015957832336, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.1747217178345, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.2009625434875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.031834602356, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.4167985916138, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.1892762184143, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.6353578567505, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.8255987167358, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 733.3033609390259, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.1230430603027, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.8726420402527, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.6031999588013, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.0008015632629, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.5527997016907, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.4808020591736, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.4268751144409, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 737.0497584342957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.675678730011, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.5540781021118, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.8072023391724, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.7966361045837, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.5872020721436, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.8929567337036, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.705436706543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.0990471839905, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.7598390579224, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 798.2548713684082, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.3843216896057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.6667141914368, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 832.005124092102, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 796.3936042785645, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.6142435073853, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 799.5707249641418, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.0219202041626, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.6415944099426, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 826.2671899795532, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 807.5595235824585, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.5057621002197, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 798.8775968551636, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.7739253044128, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.4758358001709, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 824.1686344146729, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 813.2708883285522, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 680.5820846557617, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 803.3567953109741, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 760.6950426101685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.9761538505554, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 831.2022352218628, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 824.2894315719604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1034.8675298690796, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1076.834397315979, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1033.165111541748, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1080.110559463501, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1036.677598953247, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1071.342568397522, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1042.8159952163696, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1076.5801572799683, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 
8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.6777558326721, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.9086394309998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.0649647712708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.6707172393799, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.0371150970459, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.1881642341614, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.5228805541992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.407518863678, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.1622333526611, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.0038371086121, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.4260754585266, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.2479968070984, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.6116786003113, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.41952085495, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.3054394721985, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.4022407531738, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.2796859741211, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.0583939552307, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.5739192962646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.6763167381287, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.7742366790771, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.5411267280579, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.3063945770264, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.3121604919434, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.4526419639587, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 550.7542443275452, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.4134411811829, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.8220868110657, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.557918548584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.8028812408447, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.9056015014648, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.7588810920715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.8371257781982, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.4841628074646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 796.2043261528015, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.6214365959167, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.9256019592285, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.8385629653931, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 795.064799785614, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.0252842903137, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.1275177001953, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.3449602127075, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 800.9657621383667, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.6259236335754, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.9332809448242, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.4929566383362, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 805.6567907333374, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.897759437561, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.1820769309998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.79807472229, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.8779211044312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.330237865448, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.0456070899963, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.7457642555237, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.7774424552917, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.6425533294678, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.3000011444092, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.3555235862732, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.4048018455505, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.995677947998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.1623992919922, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.880482673645, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.5331153869629, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
542.3972868919373, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.8963179588318, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.6204743385315, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.6216015815735, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.6313576698303, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.3305568695068, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.3137636184692, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.923357963562, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.0307250022888, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.0262379646301, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.4380788803101, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.4899139404297, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.6489672660828, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.3430376052856, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.3649625778198, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.8425641059875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.3886432647705, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.3171191215515, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.084005355835, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.9067215919495, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.9126448631287, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.4215984344482, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.4398412704468, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.7977557182312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.0243263244629, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 954.6132850646973, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 983.7791872024536, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 850.0809669494629, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.9438381195068, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 947.2068691253662, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.7796764373779, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 835.5209541320801, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 843.000955581665, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 954.2955207824707, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 989.6404790878296, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 853.1310415267944, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 853.5799932479858, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 946.4985513687134, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
964.4591951370239, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 831.9080018997192, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 841.8902492523193, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.1108884811401, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 996.4556884765625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 866.5036773681641, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 859.8441696166992, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 948.9329624176025, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 969.2694330215454, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 839.833607673645, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.882402420044, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 964.3443155288696, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 998.8769674301147, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 862.4568033218384, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 866.3609409332275, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.7219305038452, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 973.5979223251343, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 850.3876829147339, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 843.493595123291, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1247.4396705627441, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1196.6036748886108, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1253.0902481079102, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1203.191204071045, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1259.3569564819336, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.327359199524, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1262.1897602081299, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1210.5126333236694, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 706.1572790145874, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 689.6748733520508, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.9174389839172, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 686.6585659980774, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.3212814331055, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.4791984558105, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.0124807357788, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.5126419067383, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.4780807495117, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.6030468940735, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.6009602546692, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
611.299684047699, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.5684823989868, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 694.6448016166687, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.4639978408813, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.6072058677673, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.9710445404053, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.3824005126953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.6932830810547, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.6959962844849, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.5201606750488, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.6991958618164, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.1486377716064, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.62624168396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 924.7968101501465, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.4439988136292, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 929.9596834182739, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 800.6940770149231, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.9561529159546, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 803.1054401397705, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 928.0075168609619, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.3619208335876, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.538405418396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.9500823020935, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 735.8307242393494, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.3406448364258, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 812.0356750488281, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.1200032234192, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.8224005699158, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.6519947052002, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.1145706176758, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.0332770347595, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.5864043235779, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.0294361114502, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.8027200698853, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.3356828689575, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.9905648231506, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.3902406692505, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2633.9260864257812, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 612.5137639045715, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2653.2561588287354, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.3406429290771, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2671.504011154175, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.7409543991089, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2676.8460655212402, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.1438336372375, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1986.9647979736328, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1382.1910429000854, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1993.8891220092773, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1394.860486984253, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1995.9868907928467, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1387.7804803848267, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2005.5671977996826, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1404.0884685516357, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1998.2713508605957, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1390.7809686660767, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2006.6262435913086, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1403.6855936050415, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2000.3953552246094, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1385.6865692138672, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2011.1102676391602, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1401.5947246551514, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1879.7185611724854, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1219.4129657745361, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1878.4107398986816, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 
64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1224.373435974121, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1885.1270198822021, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1227.7771139144897, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1892.499189376831, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1234.9451208114624, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7550.028991699219, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 995.6875085830688, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7541.615829467773, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 998.9518451690674, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7552.024002075195, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1006.8326473236084, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7590.907745361328, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 1004.2790222167968, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2325.2532863616943, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2296.137933731079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2365.2686405181885, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2490.0865650177, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3382.0705795288086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3328.2355308532715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3364.321632385254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3459.93408203125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2300.016326904297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2301.934242248535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2368.136806488037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2471.312484741211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3389.721736907959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3397.2830390930176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3444.3581008911133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3542.7696228027344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2287.9634952545166, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2306.846227645874, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2372.6742362976074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2473.692150115967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3400.797920227051, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3404.9420738220215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3436.693572998047, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3542.7255821228027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2277.599334716797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2302.4955081939697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2368.753261566162, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2461.8998622894287, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3392.7294158935547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3403.560676574707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3439.5680046081543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3539.375057220459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2369.559679031372, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2488.936014175415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2749.2487812042236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3197.5519943237305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3345.940628051758, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 3309.0960121154785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3521.715679168701, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4038.7622451782227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2348.0203247070312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2466.6383838653564, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2696.7115211486816, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3143.889112472534, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3351.6892623901367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3307.6143836975098, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3513.3383560180664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4006.330547332763, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2350.494394302368, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2457.4228858947754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} 
+{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2722.036647796631, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3140.5150413513184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3346.654853820801, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3311.755790710449, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3511.783981323242, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4017.422027587891, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2339.5408153533936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2447.5622177124023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2712.127857208252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3127.0948791503906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3350.947322845459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3311.8330001831055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3508.85347366333, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4016.8814468383794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3278.179054260254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3665.1455879211426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5271.618709564209, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5403.523712158203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4104.797115325928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4187.295684814453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6038.239364624023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6186.793899536133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3291.3073348999023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3584.54158782959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5321.689910888672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5469.034729003906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4076.164455413818, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4172.8862380981445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6044.528961181641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6216.968612670898, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3341.503086090088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3582.666721343994, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5337.356605529785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5500.024948120117, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4104.877471923828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4180.167636871338, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6074.777431488037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6231.737880706787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3335.456771850586, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3566.8740463256836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5343.808937072754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5500.974235534668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4102.738361358643, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4181.780014038086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6076.649761199951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6225.860290527344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1962.5072002410889, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.7179107666016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2076.2822341918945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2298.2894229888916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
2183.277425765991, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2096.8153762817383, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2161.693925857544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2338.120641708374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1856.0558605194092, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1898.8311862945557, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1980.9948635101318, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2200.7787227630615, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2191.509437561035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2007.4171161651614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2067.5657653808594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2255.6475162506104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1842.3112106323242, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1887.1766376495361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1982.7358436584473, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2195.7697582244873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2180.8512210845947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1979.1953563690186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2053.30096244812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2263.948497772217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1829.9763298034668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1877.648983001709, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1973.911533355713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2179.156322479248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2159.1721534729004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1967.6132678985596, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2047.0966434478762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2262.039031982422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2076.0449600219727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2408.923215866089, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3096.556329727173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3132.0811080932617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2250.733766555786, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2355.063190460205, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3062.4307250976562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3213.1310176849365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2000.9524917602537, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2306.1835193634033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 2765.110397338867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2922.087516784668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2225.341272354126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2221.8268871307373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3034.3875122070312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3171.729145050049, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2001.184320449829, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2300.3471851348877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2771.8313598632812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2931.5155124664307, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2233.65008354187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2222.94864654541, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3042.3180961608887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 
1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3191.5511798858643, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1989.2753219604492, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2291.641607284546, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2773.565902709961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2929.502239227295, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2218.038396835327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2214.0537548065186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3035.0564861297607, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3182.324962615967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3193.9623737335205, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4151.05598449707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2707.0990562438965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3603.6607933044434, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3053.8827228546143, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4110.69278717041, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2620.7814407348633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3602.5545501708984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3066.490068435669, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4123.734569549561, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2618.026714324951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3613.1824111938477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3052.3884677886963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4118.141632080078, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2604.8092937469482, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3612.070083618164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1822.4259090423584, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1874.323844909668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2015.3076648712158, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2319.6843242645264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1903.288974761963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1827.4851322174072, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1902.9497337341309, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2261.23646736145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1751.5752220153809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1790.8558368682861, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1863.1598567962646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2037.8790569305418, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1859.9424266815186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1760.6377506256104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1853.454704284668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2193.4414291381836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1737.6414394378662, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1779.098720550537, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1850.3230381011963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2037.8512001037595, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1853.8227272033691, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1748.0233573913574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1850.3932857513428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2187.209596633911, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1732.4563312530518, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1776.4003276824951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1844.5606327056885, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2046.5270519256592, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1846.9128227233887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1739.2776107788086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1839.2604732513428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2190.531349182129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1924.9524688720703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2712.617120742798, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2740.6832122802734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2072.7961921691895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2395.9286403656006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2441.562900543213, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1862.3895931243896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2307.3287963867188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2353.3585262298584, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1972.4908828735352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2182.807836532593, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2259.4164752960205, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1856.1369514465332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2304.610252380371, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2343.8676834106445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1969.2479801177979, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2197.0961380004883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2255.193281173706, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1850.4859161376953, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2301.2273597717285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2338.0764961242676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1963.9484786987305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2194.8512077331543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2248.5825538635254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5152.507476806641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3510.769805908203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5001.980152130127, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3111.588478088379, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5021.744918823242, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3100.3017807006836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5033.472805023193, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3095.7464027404785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1899.7974395751953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1989.0390491485596, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2155.0366401672363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1821.1745738983154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1757.2502517700195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1873.7697505950928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1796.9629001617432, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1787.0764827728271, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1792.626085281372, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1696.8927955627441, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1660.7660675048828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1697.2890949249268, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1783.3083248138428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1789.6009731292725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1797.2740745544434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1672.0020771026611, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1654.3060684204102, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1677.418556213379, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1793.548812866211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1792.2803211212158, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1782.6718521118164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1662.3835182189941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1659.9935913085938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1679.461441040039, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 2749.5820713043213, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2582.4508666992188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2474.4275283813477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2281.0542488098145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2492.0643424987793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2294.744634628296, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2490.7241439819336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2291.5478515625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1648.95601272583, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1632.6124954223633, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1698.1339168548584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1813.8212776184082, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2105.393114089966, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, 
"num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1983.3755207061768, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2031.1073875427246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2109.0611267089844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1683.9328002929688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1686.5449619293213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1752.660322189331, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1875.2216148376465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2051.291847229004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1966.0094547271729, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2028.983039855957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2110.2267169952393, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1700.674877166748, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1689.3655967712402, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1758.1743907928467, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1870.62593460083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2049.725294113159, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1975.6360054016113, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2030.1319885253909, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2113.5256004333496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1703.7476921081543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1687.9036903381348, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1756.5970993041992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1874.8363208770752, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2054.7152042388916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1972.1846294403076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2017.085762023926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2112.240810394287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1884.852647781372, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1812.8759860992432, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2065.0228881835938, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2825.2681636810303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2098.7470531463623, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1988.223533630371, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2228.924627304077, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3039.3081378936768, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1938.3608150482178, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1848.5539150238037, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2081.45263671875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, 
"num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2842.858896255493, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2141.501922607422, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2002.2487735748289, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2205.667200088501, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3079.0238189697266, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1939.9072074890137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1843.797435760498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2082.3193645477295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2848.2295989990234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2139.2094230651855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1998.5345554351807, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2220.647678375244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
3078.7990283966064, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1933.6048030853271, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1843.5536003112793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2075.6279945373535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2857.696475982666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2131.0843181610107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1994.0660572052002, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2208.0428886413574, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3088.5987091064453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2494.1913509368896, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3575.256824493408, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3731.499786376953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2524.120969772339, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3368.1519889831543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3493.7064170837402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2544.136486053467, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3594.0499687194824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3755.416603088379, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2597.4507331848145, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3355.1088333129883, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3512.6542472839355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2541.4427375793457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3603.0031967163086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3766.8772888183594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2598.8531494140625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3362.179698944092, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3514.2855644226074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2544.0563106536865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3607.271041870117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3767.915325164795, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2596.689920425415, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3368.184814453125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3514.6094703674316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1466.7193603515625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1315.921926498413, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1363.7137603759766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1489.0710401535034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1563.353443145752, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 1410.4839992523193, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1455.1815938949585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1589.663519859314, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1472.6380681991577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1327.1510362625122, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1373.6737632751465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1484.6588850021362, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1563.9300870895386, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1420.680809020996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1462.9547309875488, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1590.7044792175293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1474.281120300293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1327.7479982376099, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 
3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1372.403998374939, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1484.756326675415, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1556.3782405853271, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1420.3697633743286, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1459.2766427993774, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1596.321930885315, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1464.4580745697021, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1319.2007970809937, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1369.5771169662476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1478.9636850357056, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1551.454553604126, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1415.4937601089478, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1457.6584100723267, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1592.440013885498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1782.2732830047607, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1570.7563304901123, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2008.481435775757, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2061.759834289551, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1740.7611274719238, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1565.6084775924683, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2070.2472019195557, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2120.11905670166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1801.8465518951416, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1579.83247756958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1994.480333328247, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 2071.0822200775146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1755.3910636901855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1549.2913484573364, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2041.7860889434817, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2127.9401779174805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1801.5620708465576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1576.0849714279175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.2606525421143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2084.2167949676514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1752.4904155731201, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1547.5499153137207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2045.433759689331, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2117.548942565918, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 
8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1792.079210281372, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1572.1385622024536, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.6809558868408, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2078.2939434051514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1754.0628719329834, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1538.94784450531, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2046.2993526458743, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2124.567337036133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2197.849750518799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3193.8487911224365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2662.544479370117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2404.2737674713135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2154.6292972564697, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3192.3340702056885, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2675.840139389038, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2419.6913719177246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2160.6534671783447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3193.9822578430176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2671.11008644104, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2421.355199813843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2164.7603130340576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3193.477268218994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2682.7462482452393, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2421.277904510498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1262.282075881958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1201.5086364746094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1422.7737617492676, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1465.2592086791992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1411.0251140594482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1211.991844177246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1426.995348930359, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1457.250075340271, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1260.540795326233, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1204.4737720489502, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1410.3353548049927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1414.775996208191, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1445.5942392349243, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1215.5359888076782, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1445.9395265579224, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1455.5521726608276, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1259.032006263733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1202.6862335205078, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1409.5865678787231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1415.9948873519897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1458.6974430084229, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1213.9878416061401, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1438.3478355407715, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1459.9865627288818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1255.8263969421387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1198.6267185211182, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 1411.665916442871, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1416.563835144043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1455.6579303741455, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.1926383972168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1438.5614347457886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1456.0063982009888, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1599.780478477478, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1674.1366481781006, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1665.4646492004395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1638.871350288391, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1568.3475160598755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1652.6535987854004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1678.5615921020508, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 
2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1625.1198530197144, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1568.5704040527344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1654.5060920715332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1675.6305503845215, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1630.347981452942, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1558.626732826233, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1653.868808746338, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1675.4491329193115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1637.54798412323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3723.2805252075195, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2257.303991317749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3780.8773231506348, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2106.2124824523926, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3792.2976875305176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2102.2619342803955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3819.5900917053223, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2096.620168685913, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1272.847843170166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1321.9567966461182, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1327.2099304199219, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1636.2148666381836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1142.5174236297607, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1166.6289710998535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1239.5072078704834, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1303.6526489257812, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1302.2471952438354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1705.6990337371826, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1180.5571269989014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1189.1742515563965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1238.2996797561646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1306.4175939559937, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1309.250078201294, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1708.7465476989746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1183.8438367843628, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1185.7729530334473, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1227.967824935913, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1300.3692817687988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1311.3686323165894, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1683.2121562957764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1183.9692735671997, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1182.3622417449951, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2217.439832687378, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1681.178903579712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2210.387706756592, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1578.6976099014282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2194.725122451782, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1576.905426979065, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2222.7692699432373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1566.0979223251343, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1391.3511991500854, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 1326.030879020691, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1394.6120071411133, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1702.0334434509277, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1469.5316743850708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1418.9839887619019, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1492.0841598510742, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1889.5489692687988, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1465.7860803604126, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1405.6204748153687, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1473.4236860275269, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1732.0811176300049, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1514.0839862823486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1469.3316745758057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} 
+{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1560.0097703933716, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1913.1121635437012, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1468.4047985076904, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1407.507038116455, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1477.4684762954712, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1734.9180698394775, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1505.677604675293, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1465.8600044250488, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1551.3670349121094, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1911.4406299591064, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1472.2489643096924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1406.6676807403564, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1475.433759689331, "config": {"BLOCK_SIZE_M": 
64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1738.7915229797363, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1506.9355249404907, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1463.9012813568115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1554.9022483825684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1917.3750305175781, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1627.5609731674194, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1889.9177742004395, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1858.1926345825195, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1848.3979225158691, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2052.4164867401123, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2084.5350456237793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1703.5097694396973, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1926.6430377960205, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1928.4545707702637, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1876.5331077575684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2061.3913536071777, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2096.091833114624, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1698.87375831604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1943.1470489501953, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1924.9762916564941, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1876.066541671753, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2075.983829498291, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2122.257432937622, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1705.8124732971191, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1953.1328105926514, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1918.957290649414, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1878.0246257781982, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2080.8945655822754, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2123.6931324005127, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2629.4921493530273, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2685.147657394409, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2702.979507446289, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2819.9939155578613, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2699.8854446411133, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2820.817451477051, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2721.4918422698975, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2827.228488922119, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1057.5131225585938, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1037.5414419174194, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1220.5158424377441, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1223.4491205215454, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1152.209768295288, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1065.4771184921265, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1388.2999992370605, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1394.9768114089966, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1084.7675275802612, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1066.4967966079712, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1241.1831998825073, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1210.3449630737305, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.6248006820679, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1088.4873628616333, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1395.546236038208, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1402.5727939605713, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1084.226884841919, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1067.0102453231812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1246.3929605484009, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1215.821294784546, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1173.4830379486084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1084.2636823654175, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1388.6584043502808, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1385.0230360031128, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1081.43967628479, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1064.3951988220215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1236.0428762435913, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1219.1348791122437, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.184947013855, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1084.03968334198, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1391.0931253433228, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1387.564001083374, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1641.020975112915, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1325.3019332885742, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1895.453462600708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1529.0574502944946, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1713.3363246917725, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1349.8348760604858, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1972.0801734924316, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1509.5516729354858, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1719.552812576294, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1342.7391862869263, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1983.0603313446045, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1520.4592037200928, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1719.7990322113037, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1343.1979084014893, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1978.858060836792, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1513.9113664627075, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 924.419846534729, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1015.1006412506102, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 984.9289560317993, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 
1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.1942405700684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 998.3777523040771, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 955.9843111038208, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 938.6153650283813, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1023.9428758621216, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 978.4814262390137, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 951.4891242980957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1012.5552034378052, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 970.5814456939697, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 934.7078466415405, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1026.7471981048584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 983.0995178222656, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 949.6609592437744, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1015.2871990203857, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 974.1088008880615, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 935.1436853408813, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1021.562876701355, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 979.0430450439453, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.3177680969238, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1009.8452806472777, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 972.2926330566406, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1321.053442955017, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1291.3400077819824, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1347.4276781082153, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1307.739839553833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1346.5800046920776, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1306.2964820861816, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1350.1857662200928, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1312.5079917907715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1218.6171197891235, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1038.818564414978, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1231.9736003875732, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1057.2744035720825, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1244.9772882461548, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1063.8995265960693, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1258.5089683532715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1064.3272066116333, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1641.5592002868652, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1737.5435066223145, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1553.2987213134766, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1575.1219129562378, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1630.8640050888062, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1689.3091297149658, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1540.603518486023, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1544.8532819747925, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1666.7198276519775, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1775.0424194335938, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1585.210394859314, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1601.134557723999, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1660.9853076934814, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1726.457748413086, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1563.943510055542, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1587.3844861984253, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1668.0526447296143, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1786.3716888427734, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1583.89967918396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1608.2689571380615, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1663.1523132324219, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1749.2984008789062, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1564.810242652893, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1584.557285308838, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1676.6187191009521, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1791.620798110962, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 
1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1585.8390522003174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1604.8867177963257, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1668.8651371002197, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1741.0388660430908, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1550.6168031692505, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1593.7572717666626, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2199.7723293304443, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2105.3575897216797, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2271.6017532348633, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2201.5503787994385, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2278.3329582214355, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2194.853754043579, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2277.670087814331, "config": {"BLOCK_SIZE_M": 
128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2203.658227920532, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1149.4248008728027, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1201.0124731063843, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1114.7900915145874, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1109.6756744384766, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1131.1153602600098, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1050.0936031341553, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1161.6267204284668, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1212.163200378418, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1123.1193590164185, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1124.148645401001, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1149.4571208953857, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1058.7790203094482, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1162.6384019851685, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1211.0390329360962, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1120.8777523040771, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1129.510407447815, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1147.9241609573364, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1063.5831928253174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1174.305911064148, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.2958402633667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1129.4958400726318, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1135.2355241775513, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1154.0955114364624, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1058.0985641479492, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1639.860315322876, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1420.9270286560059, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1669.1107082366943, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1458.9315223693848, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1670.8129501342773, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1453.2006549835205, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1672.5340747833252, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1462.3507118225098, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1355.4844760894775, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1018.0417585372925, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1215.4121589660645, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 895.5651235580444, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1386.5420818328857, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1022.2633600234985, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1249.7433519363403, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 901.5065574645996, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1401.9988918304443, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1030.3695964813232, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1235.3534317016602, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 898.5804748535156, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1402.422399520874, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1032.8955125808716, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1244.788475036621, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 908.5278415679932, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4880.365619659424, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} 
+{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1023.4084749221802, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4853.734874725342, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1020.901927947998, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4877.472496032715, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1040.196795463562, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4915.046195983887, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1046.9940662384033, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2231.3545513153076, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1594.9457597732544, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2256.235990524292, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1611.6910457611084, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2252.9142475128174, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1625.1665496826172, "config": 
{"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2273.8785552978516, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1639.686689376831, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2256.4112091064453, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1625.8951950073242, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2277.885446548462, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1642.6534271240234, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2279.5150470733643, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1631.4622402191162, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2299.007034301758, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1662.7503967285156, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1952.05246925354, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1384.4667196273804, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1963.1043338775635, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1399.660964012146, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2000.750379562378, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1400.0662326812744, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2086.5492725372314, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1417.965440750122, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8151.043663024902, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1137.5107192993164, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8101.377296447755, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1147.5318479537964, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8126.138877868651, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1155.546555519104, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8187.997932434081, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1172.4057626724243, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4501.772003173828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4476.681880950928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4595.272674560547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4824.999027252197, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6491.373138427734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6451.748886108398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6519.189758300781, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6694.749221801758, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4445.0982666015625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4513.853130340576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4607.033271789551, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 4758.890552520752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6519.580955505371, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6620.930976867676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6660.732536315918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6863.767547607422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4385.213603973389, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4488.21346282959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4589.728145599365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4719.666290283203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6550.609931945801, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6623.67244720459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6695.572891235352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6868.200759887695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4349.499378204346, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4481.035556793213, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4566.498260498047, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4689.734401702881, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6534.305877685547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6595.338973999023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6661.113128662109, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6858.8945388793945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4602.259044647217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4837.270107269287, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5330.09407043457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6223.501930236816, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6476.223373413086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6443.577041625977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6813.443222045898, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7823.028411865234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4516.089916229248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4762.248134613037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5140.965576171875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6062.722225189209, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6474.412040710449, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6450.106506347656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6828.327674865723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7763.375625610352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4491.726551055908, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 4710.494422912598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5138.614902496338, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6004.417095184326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6473.797721862793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6457.386703491211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6795.108413696289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7782.135162353516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4452.567825317383, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4676.65132522583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5112.550506591797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5957.404079437256, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6474.9260330200195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6447.209930419922, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6796.417465209961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7806.230392456055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6332.819709777832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7135.026893615723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10207.952690124512, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10461.305046081543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7903.491401672363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8124.693450927735, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11747.277526855469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12013.495712280273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6241.043319702148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6917.86865234375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10258.932838439941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10536.24095916748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7833.250579833984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8106.434211730956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11732.614250183105, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12013.905944824219, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6245.468158721924, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6870.0947189331055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10290.87963104248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10575.134773254395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7885.059509277344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8121.0345458984375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11767.666091918945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 12055.329704284668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6243.568458557129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6835.126037597656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10312.058029174805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10589.687614440918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7896.106338500977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8122.471122741698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11756.274185180664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12066.599807739258, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3793.312530517578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3828.1289863586426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4013.2694244384766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4440.006904602051, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} 
+{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4149.987201690674, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4081.1844825744624, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4182.929744720459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4452.187042236328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3523.9007568359375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3603.867988586426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3802.4729537963867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4189.268283843994, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4114.420166015625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3937.844982147217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3997.6576042175293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4250.606575012207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3431.3582038879395, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3543.093090057373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3768.696460723877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4149.169750213623, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4069.345607757568, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3769.8972702026367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3991.5707397460938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4240.783576965332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3385.4092407226562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3507.2334480285645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3736.8077087402344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4120.274543762207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4071.1187171936035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 3779.008140563965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3910.220470428467, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4233.52819442749, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3967.543830871582, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4669.752979278564, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5991.007862091064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6032.094097137451, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4294.205303192139, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4483.299198150635, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5904.977283477783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6238.525276184082, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3837.508964538574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4345.179653167725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} 
+{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5309.876937866211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5604.533748626709, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4191.069240570068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4218.344631195068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5845.201778411865, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6097.564868927002, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3794.8515129089355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4336.195507049561, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5316.790561676025, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5619.11600112915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4163.519535064697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4168.171539306641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5857.406425476074, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6119.179668426514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3754.4336128234863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4299.7100830078125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5316.496448516846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5622.731857299805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4150.717430114746, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4162.398376464844, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5858.257732391357, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6120.5659103393555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6175.119705200195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8049.147415161134, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5206.583499908447, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6975.384483337402, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5865.055198669434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7960.373420715332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4995.047397613525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6953.708953857422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5860.794429779053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7963.560791015625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5025.825271606445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6950.38818359375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5831.2006187438965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7974.543342590332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5018.5211753845215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6963.239974975586, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3511.988945007324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3601.803379058838, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3880.9281730651855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4476.836833953857, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3647.3213958740234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3526.692314147949, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3649.1643142700195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4374.498043060303, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3328.064708709717, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3378.2118225097656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3554.215850830078, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3834.5805168151855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
3489.000473022461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3372.9222297668457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3561.5489959716797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4130.829277038574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3264.5939445495605, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3322.091064453125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3520.376625061035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3820.1326751708984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3445.870590209961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3332.6084327697754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3524.8219108581543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4128.100337982178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3232.072582244873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3296.494083404541, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3494.9300575256348, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3819.941749572754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3415.5188941955566, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3307.6962089538574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3500.6948471069336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4127.466907501221, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3685.0516510009766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5334.110870361328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5270.118885040283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3945.6153678894043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4625.023670196533, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4706.414222717285, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3490.0711822509766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4267.937145233154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4407.587203979492, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3694.404468536377, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4087.0539855957027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4233.083534240723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3445.5129432678223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4267.350101470947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4374.905776977539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3669.6294593811035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4086.6883659362793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4229.765281677246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3425.679397583008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4261.112632751465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4378.818035125732, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3669.196300506592, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4090.2126121521, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4229.567489624023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9747.15045928955, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6737.411727905273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9281.757926940918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5949.799861907959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9250.796775817871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5940.980663299561, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9325.881004333496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 
64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5935.764141082764, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3639.725818634033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3866.80477142334, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4113.134059906006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3515.4472160339355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3507.6281929016113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3597.716808319092, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3372.3005867004395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3266.4663696289062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3320.0749015808105, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3252.4467277526855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3069.560136795044, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
3151.444139480591, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3343.192958831787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3266.6601753234863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3302.898406982422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3184.32767868042, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3060.3897380828857, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3122.3334407806396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3354.7313690185547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3264.2067527770996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3299.488925933838, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3152.651844024658, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3049.893922805786, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3127.1352100372314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5217.597427368164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4897.487201690674, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4590.868148803711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4204.952335357666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4624.66064453125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4243.705749511719, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4650.420455932617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4257.638244628906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3118.718252182007, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3088.4513664245605, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3228.6662673950195, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3433.58154296875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3766.613426208496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3607.0473289489746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3705.599250793457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3869.3489265441895, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3205.5910301208496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3174.9499320983887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3300.84716796875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3504.136791229248, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3701.3529777526855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3630.0294303894043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3729.504623413086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3860.379066467285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3223.0859375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 3172.332019805908, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3299.848003387451, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3501.541748046875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3716.8892860412598, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3623.1249809265137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3718.340435028076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3857.16495513916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3225.405445098877, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3168.0207920074463, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3287.7656173706055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3495.246696472168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3722.9819297790527, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3623.827476501465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3765.1364517211914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3869.289722442627, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3466.744804382324, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3419.8892784118652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3874.754066467285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5302.344951629639, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3880.952663421631, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3643.459529876709, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4094.6449661254887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5723.1366539001465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3581.3921546936035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3455.153121948242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3870.364513397217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5342.457275390625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3942.040557861328, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3695.1289558410645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4058.352012634277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5768.589763641357, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3583.4195518493652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3446.4673233032227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3868.288993835449, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5359.234390258789, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3942.243881225586, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3693.241901397705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4050.8732604980473, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 5783.154544830322, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3594.0830039978027, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3432.964973449707, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3857.1899032592773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5359.710693359375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3938.7405014038086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3692.9572677612305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4045.120162963867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5794.701290130615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4703.193759918213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6730.025863647461, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7034.223175048828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4736.618900299072, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} 
+{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6349.532470703125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6558.7089920043945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4776.399993896484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6765.701751708984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7063.69930267334, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4804.255352020264, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6335.873928070068, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6592.266006469727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4789.127044677734, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6776.029205322266, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7080.133476257324, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4868.7005043029785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6334.608936309814, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6615.4522705078125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4789.416313171387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6781.78581237793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7088.938446044922, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4863.076515197754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6337.70658493042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6600.793609619141, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2720.5924701690674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2483.000135421753, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2557.7582454681396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2807.9740715026855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2887.930555343628, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 2642.5675296783447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2729.7830390930176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2942.9398441314697, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2714.682083129883, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2481.077461242676, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2546.5123176574707, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2770.3656101226807, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2895.2795219421387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2607.7709007263184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2708.1854248046875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2939.1771125793457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2714.30495262146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2478.200340270996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 
3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2546.5188694000244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2772.149600982666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2881.8273735046387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2604.538402557373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2704.7207927703857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2940.7270431518555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2694.829921722412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2463.8759994506836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2545.979347229004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2763.0006408691406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2867.0900535583496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2610.8263969421387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2702.8379344940186, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2941.325922012329, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3304.3449783325195, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2946.0659313201904, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3681.593132019043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3774.7881507873535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3171.1609649658203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2925.623025894165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3788.6430168151855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3931.0774993896484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3327.6171684265137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2923.1019115448, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3621.9423866271973, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 3753.1246376037598, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3165.8862495422363, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2857.4206352233887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3717.0596885681152, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3865.8922958374023, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3334.316005706787, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2924.689598083496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3620.1977729797363, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3770.849952697754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3176.141757965088, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2844.6001625061035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3720.8348655700684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3876.608295440674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, 
"num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3328.4422492980957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2904.8958492279053, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3614.6722984313965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3763.5817527770996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3183.7396717071533, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2835.171184539795, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3723.5833740234375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3868.010883331299, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4164.863548278809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6000.172481536865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4880.343532562256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4500.551853179932, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4057.3843002319336, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5990.242042541504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4934.727382659912, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4516.934585571289, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4065.2088165283208, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5999.169750213623, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4946.841106414795, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4526.129913330078, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4067.943077087403, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6001.995334625244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4967.490997314453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4524.410858154297, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2329.4815921783447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2254.2701053619385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2658.5729789733887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2751.10577583313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2524.0403175354004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2273.901767730713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2633.316650390625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2693.74080657959, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2323.999032974243, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2201.540126800537, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2587.860336303711, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2608.096332550049, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2547.7801513671875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2203.648633956909, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2632.353458404541, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2671.669120788574, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2317.3291301727295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2192.876319885254, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2584.748487472534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2614.9190425872803, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2570.39870262146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2208.9948749542236, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2629.9943828582764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2675.008478164673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2308.3233642578125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2189.1340732574463, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 2578.7911891937256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2599.204158782959, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2569.041585922241, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2208.9455795288086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2635.618886947632, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2671.4688396453857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2987.042074203491, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3056.736011505127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3008.3004760742188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2971.368474960327, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2881.4502239227295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2984.9358463287354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2952.034397125244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} 
+{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2931.6843223571777, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2864.007501602173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2988.917293548584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2983.052167892456, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2929.928960800171, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2858.8468837738037, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2982.063512802124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2966.4201641082764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2931.808490753174, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6786.209259033203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4189.369297027588, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6801.674156188965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3874.8891258239746, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6815.809783935547, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3881.0673904418945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6844.6452713012695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3891.468029022217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2265.273609161377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2397.8164863586426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2418.663845062256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2885.739040374756, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2054.7451210021973, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2186.054229736328, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2173.60990524292, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2342.2713661193848, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2374.5987224578857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2956.7659187316895, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2123.346529006958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2159.929599761963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2144.9292850494385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2342.829303741455, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2371.6324615478516, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2988.371047973633, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2117.5566387176514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2162.1068954467773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2130.33935546875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2333.969268798828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2377.947368621826, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2985.001745223999, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2125.984516143799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2162.61775970459, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3956.224308013916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2983.755865097046, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3867.3764419555664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2749.0939140319824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3878.766269683838, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2768.2540893554688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3888.159713745117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2770.9875106811523, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2484.5174503326416, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 2374.1059017181396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2539.62703704834, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2870.965929031372, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2581.8467235565186, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2532.2443294525146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2655.8878326416016, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3146.6694355010986, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2662.3427200317383, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2554.0430450439453, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2682.876787185669, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3022.5736045837402, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2697.899351119995, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2644.7134399414062, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} 
+{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2758.72145652771, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3216.949586868286, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2692.0255851745605, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2567.3808097839355, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2686.133451461792, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3010.448799133301, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2698.8494396209717, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2647.906713485718, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2774.6555137634277, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3212.5121212005615, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2699.0243339538574, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2562.335367202759, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2680.5871772766113, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3009.105110168457, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2693.1356811523438, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2642.436475753784, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2773.9401626586914, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3224.5347118377686, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2914.9937629699707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3213.763484954834, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3195.325756072998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3100.2360248565674, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3518.490734100342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3605.5848503112793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3126.227045059204, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 3347.797565460205, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3368.3764839172363, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3192.1017742156982, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3548.5290908813477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3611.4699363708496, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3145.4678440093994, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3370.285243988037, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3372.3502349853516, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3200.5257606506348, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3559.3400382995605, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3610.050106048584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3136.630268096924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3415.272846221924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, 
"num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3368.572940826416, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3204.909152984619, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3591.006908416748, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3615.6894493103027, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4395.183029174805, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4496.532001495361, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4660.5768394470215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5022.982711791992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4740.288944244385, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5031.897106170654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4859.23770904541, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4996.597099304199, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1837.0198440551758, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1847.7694511413574, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2085.965919494629, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2163.585786819458, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1974.6622467041016, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1893.5058879852295, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2340.477924346924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2382.181463241577, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1877.7051258087158, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1887.201747894287, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2059.0100860595703, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2104.0542602539062, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2019.3516826629639, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1909.5312023162842, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2363.4921550750732, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2375.039358139038, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1874.095516204834, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1888.3601570129395, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2072.476644515991, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2095.943841934204, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2019.9107265472414, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1898.3524703979492, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2372.332181930542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2383.620481491089, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1872.3390483856201, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1884.1473484039307, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 
4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2087.944812774658, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2105.9988689422607, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2013.5446262359617, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1886.812505722046, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2378.599843978882, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2380.7683277130127, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2775.089111328125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2277.700490951538, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3207.937431335449, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2549.154224395752, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2907.276153564453, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2255.669937133789, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3343.5718154907227, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2544.282398223877, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2934.0635204315186, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2262.919521331787, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3355.393753051758, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2544.775676727295, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2961.8897247314453, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2277.4760246276855, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3375.8998489379883, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2550.0185775756836, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1565.0550413131714, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1646.5396690368652, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1675.9718418121338, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1606.018214225769, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1627.3336029052734, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1635.5547332763672, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1586.362247467041, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1650.9264183044434, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1584.6915197372437, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1628.764820098877, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1653.036642074585, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1560.931043624878, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1574.4308805465698, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1652.5438404083252, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1599.7606468200684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1625.2513694763184, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1654.4591999053955, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1568.3742380142212, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1571.705436706543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1653.3078384399414, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1605.5446290969849, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1622.8382301330566, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1659.5971393585205, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1579.017915725708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2166.190881729126, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2123.4655952453613, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2211.9351863861084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2147.7110385894775, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 2219.3915271759033, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2153.4294605255127, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2228.9200019836426, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2167.0872020721436, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1966.635971069336, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1737.1729850769043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2008.9507389068601, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1778.6039733886719, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2019.129123687744, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1781.6024017333984, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2029.2740917205808, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1782.5032043457031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2544.0310287475586, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, 
"num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2666.646890640259, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2471.5545558929443, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2544.7724628448486, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2532.98752784729, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2597.9281425476074, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2434.9660682678223, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2513.2185554504395, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2705.142068862915, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2823.927993774414, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2657.149305343628, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2678.7892627716064, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2674.005756378174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2743.731346130371, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2597.96462059021, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2627.9177570343018, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2704.514560699463, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2824.0627479553223, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2649.4464015960693, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2679.812641143799, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2675.7040119171143, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2755.8220958709717, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2577.6774406433105, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2647.2836875915527, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2714.5683193206787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2838.8308811187744, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2642.5849628448486, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2716.9918537139893, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2686.0366249084473, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2765.557279586792, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2579.8083114624023, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2658.470239639282, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3431.207695007324, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3298.8564682006836, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3651.2719917297363, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3542.6519775390625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3669.569263458252, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3562.3171615600586, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3684.4561767578125, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3579.6459007263184, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1751.9115161895752, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1850.3244972229004, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1764.5791816711426, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1708.96879196167, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1747.1977710723877, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1674.3115043640137, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1815.3483200073242, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1897.80366897583, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1804.3208026885986, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1775.0193786621094, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1806.7799949645996, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 1726.9966316223145, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1816.215991973877, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1896.871042251587, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1812.8505611419678, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1771.8852710723877, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1807.6800060272217, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1733.6508750915527, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1821.3190460205078, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1906.015043258667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1816.7331218719482, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1778.5372734069824, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1813.5104084014893, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1739.2193603515625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 4}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2545.1257705688477, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2166.2945556640625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2622.8647994995117, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2274.1382598876953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2633.0740547180176, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2281.5617656707764, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2651.868963241577, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2294.6596717834473, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1978.2046031951904, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1526.226725578308, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1776.9603061676025, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1344.694414138794, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
2063.998727798462, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1531.3457584381104, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1844.1820621490479, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1348.5841608047485, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2066.7555046081543, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1536.5121603012085, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1852.3998069763184, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1350.2444696426392, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2092.8347206115723, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1556.6660737991333, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1864.3438339233398, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1353.5115242004395, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6773.487854003906, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} 
+{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1424.6358346939087, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6673.906211853027, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1453.8748836517334, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6728.685111999512, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1456.8828773498535, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6761.390686035156, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1472.1300888061523, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3791.300811767578, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2728.6046409606934, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3810.3568077087402, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2764.152822494507, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3982.0172691345215, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2759.61199760437, "config": 
{"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4016.6202926635738, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2780.1638317108154, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3986.1649322509766, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2757.867670059204, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4030.6641769409175, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2795.4072093963623, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4012.9912948608403, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2762.097930908203, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4059.8062133789067, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2789.9479961395264, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3559.799041748047, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2370.920629501343, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3588.8417625427246, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2467.440481185913, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3609.7025299072266, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2469.7105503082275, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3722.0630645751953, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2487.5911903381348, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 14228.325958251953, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1904.384651184082, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 14099.525146484375, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1947.159194946289, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 14190.080337524414, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1940.0942420959473, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 14338.91616821289, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1979.7633647918701, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8921.394233703613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8802.946014404297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9057.368621826172, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9526.80866241455, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12715.99422454834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12744.107322692871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12912.143478393555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13213.87321472168, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8757.198257446289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8826.296920776367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9035.49201965332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 9384.94197845459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12826.043853759766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13063.629684448242, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13176.929473876953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13547.579498291016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8562.346229553223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8751.338386535645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8932.509841918945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9215.843048095703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12884.165267944336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13066.070175170898, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13105.992965698242, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13587.589263916016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8442.51392364502, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8688.856315612793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8834.5552444458, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9102.604522705078, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12871.039352416992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13059.170684814453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13189.483337402344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13590.643157958984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9061.984252929688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9576.330604553223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10495.016326904297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12295.045127868652, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12813.764152526855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12702.4361038208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13477.784576416016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15426.7919921875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8857.227821350098, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9389.389457702637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10090.14087677002, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11900.856666564941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12820.61538696289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12721.660537719727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13451.278076171875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15264.25537109375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8689.59270477295, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 9208.91586303711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9898.239784240723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11677.989959716797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12820.185546875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12756.160278320312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13455.092391967773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15398.92219543457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8581.95785522461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9075.857429504395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9794.788780212402, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11617.53776550293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12823.20816040039, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12718.7788772583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} 
+{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13457.485580444336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15387.900390625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12477.331886291504, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 14081.559143066406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20124.072494506836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20633.803253173828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15470.345306396484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 16005.506439208986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23119.6044921875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23613.29620361328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12175.71418762207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13572.221145629883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20156.85791015625, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20669.051666259766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15368.141555786133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 16007.25715637207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23097.970123291016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23648.968658447266, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12014.785919189453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13399.975509643555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20195.578079223633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20713.53401184082, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15402.4267578125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15997.926177978516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23131.79039001465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23689.181365966797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11965.028533935547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13307.84194946289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20258.31329345703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20784.565353393555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15465.474090576172, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15989.4140625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23174.753875732422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23728.900299072266, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7546.294174194336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7568.195343017578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7923.584327697754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8749.089050292969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 
4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8156.69376373291, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8025.545616149902, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8213.760108947754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8755.23151397705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6950.532646179199, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7112.708511352539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7482.588653564453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8287.855911254883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7990.293960571289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7578.736877441406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7790.513763427734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8375.339813232422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6740.573768615723, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6941.6357421875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7350.207786560059, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8133.879165649414, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7889.267845153809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7540.074844360352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7835.670738220215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8361.945991516113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6580.885848999023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6851.1309814453125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7268.830757141113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8079.184684753418, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7842.289123535156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7507.826461791992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7775.049247741699, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8308.173217773438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7817.067337036133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9218.370704650879, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11759.391288757324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11810.13786315918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8414.02816772461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8783.094100952148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11585.993309020996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12300.49674987793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7541.93473815918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8485.203742980957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, 
"num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10423.292846679688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11003.731269836426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8146.733779907227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8287.874145507812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11492.636642456055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11990.254974365234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7391.43123626709, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8313.518524169922, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10433.409767150879, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11014.082832336426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8043.992576599122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8194.612579345703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11498.256568908691, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12000.086059570312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7293.100166320801, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8318.657836914062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10443.184051513672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11035.109596252441, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7988.163070678711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8183.712196350097, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11509.198417663574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12019.485397338867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12139.013366699219, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15865.266647338867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10217.134170532227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13726.988296508789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11457.092399597168, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15657.847671508789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9824.922752380371, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13647.091827392578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11340.995063781738, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15649.838333129883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9744.701538085938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13659.579238891602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11291.09058380127, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15675.315170288086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9724.14836883545, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13675.612258911133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6971.566390991211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7113.076438903809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7648.276901245117, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8820.225677490234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7229.855995178223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6963.268165588379, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7202.675132751465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8622.655181884766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6504.2303466796875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6584.717979431152, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7016.745948791504, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7520.47248840332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 6859.061660766602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6674.165191650391, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7014.051818847656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8101.257133483888, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6339.448337554932, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6413.9667320251465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6943.759346008301, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7514.387397766113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6747.604789733887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6609.2072677612305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6952.904357910156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8100.79490661621, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6278.879680633545, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 
4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6360.822906494141, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6900.798492431641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7515.477447509766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6684.498138427734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6521.629772186279, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6897.267150878906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8096.141395568849, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7281.971702575684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10587.998847961426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10339.899940490723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7742.278938293457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9166.457901000977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9231.913146972656, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6824.395790100098, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8372.547225952148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8600.730628967285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7255.618019104004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8035.645332336426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8318.728866577148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6702.249336242676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8365.238456726074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8584.620590209961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7159.531440734863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8029.272994995118, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8313.734893798828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6661.586112976074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8377.605934143066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8609.520874023438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7152.236480712891, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8027.976570129395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8317.921905517578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 19114.003982543945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13265.303421020508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17829.020767211914, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11673.614959716797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17603.336029052734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11601.312675476074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17561.44073486328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11599.622917175293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7192.3652267456055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7864.924049377441, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8102.756042480469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6971.127395629883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7029.443130493164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7102.93643951416, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6697.6287841796875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6398.336429595947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6489.144706726074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6449.186134338379, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5987.6518630981445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 6105.659523010254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6545.23551940918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6390.511150360107, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6478.512191772461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6296.991806030273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5969.944667816162, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6084.979248046875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6510.368194580078, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6382.5983810424805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6471.653060913086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6211.999187469482, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5970.493412017822, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6107.236766815186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10194.469947814941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9579.85164642334, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9062.497444152832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8250.183715820312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9135.401916503906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8306.723518371582, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9164.434585571289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8349.431533813477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6121.373729705811, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6090.390205383301, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6353.202228546143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6753.126373291016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7294.752655029297, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7045.125885009766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7218.518829345703, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7486.556282043457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6254.959506988525, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6212.639198303223, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6448.1660079956055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6837.817077636719, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7171.026039123535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7075.902557373047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7259.60994720459, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7493.586235046387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6241.973743438721, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 6197.01473236084, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6431.444911956787, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6808.745765686035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7193.955230712891, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7066.846237182617, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7258.921318054199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7492.333564758301, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6230.764503479004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6199.041404724121, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6458.719806671143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6806.267929077148, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7212.549591064453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7148.083686828613, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7307.613067626953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7545.159759521484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6729.412612915039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6685.7780838012695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7571.610336303711, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10270.978965759277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7515.844802856445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7138.77742767334, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7922.033538818359, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11108.142204284668, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6868.645668029785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6707.935218811035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7496.704216003418, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10306.874313354492, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7572.888145446777, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7205.68416595459, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7890.619659423828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11146.778182983398, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6881.10237121582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6668.924942016602, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7475.246238708496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10353.437767028809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7569.817924499512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7162.161865234375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7877.142066955566, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 11191.718864440918, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6920.898704528809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6696.620178222656, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7438.426780700684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10335.148887634277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7605.741882324219, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7227.794189453125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7880.672836303711, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11194.249229431152, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9092.849006652832, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13065.132369995117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13673.971862792969, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9176.814422607422, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12326.59294128418, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12724.54574584961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9136.300964355469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13081.178359985352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13677.448806762695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9162.568740844727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12296.844215393066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12764.417724609375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9241.964797973633, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13118.581771850586, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13715.647201538086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9291.53938293457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12299.37858581543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 
32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12792.325477600098, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9259.955673217773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13144.612884521484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13740.750503540039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9291.710815429688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12300.87875366211, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12773.201866149902, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5242.7069091796875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4901.962261199951, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5063.720798492432, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5439.4683265686035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5579.942569732666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 5271.495342254639, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5415.397090911865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5692.954082489014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5232.861309051514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4865.835494995117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5024.890384674072, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5351.528491973877, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5565.492839813232, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5200.496368408203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5363.077774047852, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5708.000183105469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5220.245895385742, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4853.94157409668, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4998.821907043457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5347.2536277771, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5551.515197753906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5169.713611602783, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5344.7577476501465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5707.862224578857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5228.631858825684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4847.395648956299, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4995.970230102539, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5356.981258392334, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5588.656024932861, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5217.306385040283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5371.806106567383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5757.880477905273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6374.928455352783, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5721.066875457764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7129.097137451172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7331.862869262695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6120.778541564941, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5677.575969696045, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7334.541664123535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7632.723159790039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6367.702560424805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5563.396320343018, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6996.033744812012, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 7236.088905334473, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6037.512664794922, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5594.335498809814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7228.446426391602, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7462.286186218262, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6387.316837310791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5550.875873565674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7007.556991577148, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7255.130271911621, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6111.2566566467285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5567.767181396484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7211.662521362305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7479.581184387207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 
4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6367.520980834961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5573.444633483887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7009.959182739258, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7256.384353637695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6131.035995483398, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5559.947204589844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7206.420783996582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7470.553131103516, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8085.631370544434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11637.265968322754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9331.630821228027, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8726.62338256836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7806.331748962402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 
64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11600.500411987305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9464.11075592041, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8722.527809143066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7812.524185180664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11622.32608795166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9484.233207702637, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8741.754837036133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7826.434783935547, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11628.552474975586, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9588.116226196289, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8744.411087036133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4491.222190856934, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 4425.335216522217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5109.991054534912, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5280.927200317383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4831.8500900268555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4541.970062255859, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4993.255176544189, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5140.538215637207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4453.301086425781, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4286.255798339844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4835.577583312988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4900.55534362793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4821.284484863281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4346.50016784668, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} 
+{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4982.926731109619, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5070.86238861084, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4441.084957122803, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4240.1971435546875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4846.314373016357, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4904.797763824463, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4864.926738739014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4392.61100769043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5000.529594421387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5082.287502288818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4434.372482299805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4230.29691696167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4841.675186157227, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4902.29024887085, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4846.726722717285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4387.500782012939, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5005.563011169434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5075.220642089844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5815.10124206543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5965.20622253418, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5757.874736785889, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5741.0747146606445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5578.766269683838, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5770.555324554443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5509.973278045654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5658.3909034729, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5547.990741729736, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5775.943870544434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5542.866897583008, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5665.225315093994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5526.2873458862305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5775.366554260254, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5504.351215362549, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5663.755855560303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12964.179992675781, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8015.7244873046875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12815.894813537598, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7490.6207275390625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12833.862991333008, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7495.19718170166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12879.483184814453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7530.080261230469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4275.015678405762, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4513.440628051758, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4638.223667144775, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5394.699821472168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4033.9766693115234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4261.799182891846, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4064.9882888793945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4321.758270263672, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 4393.400497436523, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5475.376110076904, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4015.00093460083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4076.5660667419434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4004.2167663574214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4328.8886642456055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4389.354667663574, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5572.663154602051, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3990.904312133789, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4087.2072219848637, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3955.257110595703, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4314.093475341797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4385.994548797607, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 
4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5557.371349334717, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4011.101741790771, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4089.041652679444, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7387.913932800293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5673.145446777344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7068.588218688965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5149.834060668945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7103.826675415039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5171.249618530273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7122.047653198242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5179.616603851318, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4721.483535766602, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4626.976776123047, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4942.101402282715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5558.856315612793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4936.938877105713, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4886.602687835693, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5091.270713806152, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5883.175048828125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5072.849521636963, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4927.640323638916, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5139.209613800049, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5769.662113189697, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5161.816825866699, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5080.3424072265625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 5285.80623626709, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6066.2321853637695, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5139.783153533936, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4937.02672958374, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5139.164962768555, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5752.380294799805, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5195.106887817383, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5128.765754699707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5317.273120880127, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6043.233585357666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5137.52067565918, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5034.200325012207, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5285.714225769043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5873.941116333008, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5220.712776184082, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5420.234508514404, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5621.2993240356445, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6132.96142578125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5686.548290252686, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6098.449459075928, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6154.724044799805, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5755.120105743408, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6574.627380371094, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6757.98210144043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6009.357624053955, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6320.3631591796875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6400.383834838867, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5999.11506652832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6616.667861938477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6775.768013000488, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6063.161392211914, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6345.622844696045, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6405.554294586182, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6012.612113952637, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6673.530235290527, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6786.916847229004, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6022.8545570373535, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6549.9711990356445, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 6467.179985046387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6040.431880950928, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6826.678466796875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6802.9462814331055, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8211.717071533203, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8330.993385314941, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8784.50366973877, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9547.28042602539, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8966.997108459473, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9559.884376525879, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9305.77091217041, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9317.547492980957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3478.3670234680176, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} 
+{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3544.8641777038574, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3961.253433227539, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4073.9996910095215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3718.238410949707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3601.4801597595215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4304.349746704102, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4351.166572570801, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3526.5097427368164, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3548.5327911376953, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3757.4961853027344, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3938.906593322754, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3788.4286499023438, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3591.571846008301, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4310.843372344971, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4345.727672576904, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3523.0323219299316, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3543.544807434082, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3777.9487800598145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3938.822555541992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3791.270046234131, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3602.6802825927734, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4314.800815582275, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4343.420448303223, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3523.5100746154785, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3520.556182861328, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 3874.6030235290527, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3995.9201622009277, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3801.1135864257812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3535.406894683838, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4327.811374664307, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4353.367824554443, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5194.7731590271, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4263.067665100098, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5988.117733001709, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4773.443756103516, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5438.2037353515625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4118.177108764648, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6205.970993041992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} 
+{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4755.063171386719, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5552.203845977783, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4125.783958435059, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6250.855541229248, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4761.8889808654785, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5633.015022277832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4221.579170227051, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6293.305606842041, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4766.275691986084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2974.7632026672363, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3186.078233718872, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3173.4646320343018, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3025.1212787628174, "config": {"BLOCK_SIZE_M": 
64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3045.949754714966, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3118.4873485565186, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2880.361089706421, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2952.5241661071777, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2902.7505493164062, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2957.41117477417, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2950.740785598755, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2878.1841373443604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2844.810085296631, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2960.7401275634766, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2889.6886253356934, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2936.181116104126, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2960.477924346924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2851.6566467285156, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2838.7814331054688, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2989.2156887054443, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2934.9668979644775, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2933.474063873291, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2988.5433769226074, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2862.9895973205566, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3929.7780990600586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3835.437545776367, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4017.5388717651367, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3886.0865592956543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4079.34799194336, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3920.0889778137207, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4156.153869628906, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3978.694896697998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3539.1539573669434, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3115.3745555877686, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3549.407501220703, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3143.4239864349365, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3560.8947372436523, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3156.056480407715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3599.956169128418, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3183.416795730591, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4415.585784912109, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
4497.762222290039, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4350.323390960693, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4609.195499420166, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4456.599960327148, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4383.949565887451, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4244.342555999756, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4514.317760467529, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5021.20512008667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5195.3972816467285, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4943.681774139404, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4939.017105102539, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4927.8521728515625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5021.814212799072, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4760.08638381958, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4788.812675476074, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5046.364765167236, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5197.506885528564, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4991.850051879883, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5463.424320220947, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4956.982421875, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5017.5568199157715, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4799.3110275268555, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5310.731639862061, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5067.37154006958, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5207.324466705322, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5093.074531555176, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5885.851535797119, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4984.56579208374, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5019.412002563477, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4914.679298400879, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5748.238277435303, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5819.990863800049, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5617.7463722229, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6663.072319030762, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6541.342887878418, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6692.149505615234, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6444.217224121094, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6744.459991455078, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 6478.198013305664, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3130.5244636535645, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3111.168165206909, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3048.563995361328, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3070.095043182373, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2946.063823699951, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3023.13871383667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3300.2358436584473, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3238.816967010498, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3205.173749923706, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3213.7282943725586, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3135.974712371826, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3085.6080055236816, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3300.726737976074, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3261.236152648926, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3342.7804565429688, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3205.916805267334, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3147.18656539917, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3214.2523288726807, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3315.5582427978516, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3296.5883255004883, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3389.9593544006348, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3216.247844696045, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3172.9556941986084, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3275.124931335449, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4271.883354187012, "config": {"BLOCK_SIZE_M": 128, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3635.2961921691895, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4542.604808807373, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4024.113445281983, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4598.404808044434, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4030.421142578125, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4692.101268768311, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4060.154399871826, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3375.795021057129, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2596.9678592681885, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2999.666872024536, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2274.325580596924, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3539.1600036621094, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2617.7446269989014, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3128.3199977874756, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2301.5169620513916, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3572.500991821289, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2636.0851192474365, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3148.9195251464844, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2308.2815837860107, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3645.4359817504883, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2671.486873626709, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3201.6777515411377, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2331.713285446167, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12129.61498260498, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2472.450065612793, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12021.38786315918, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2516.5416049957275, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12109.438362121582, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2543.803997039795, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12266.20891571045, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2573.5532760620117, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5870.174865722656, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4383.220615386963, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5940.574893951416, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4443.357467651367, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6369.368801116943, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4693.510112762451, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 6454.839515686035, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4678.781585693359, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6425.933494567871, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4726.06876373291, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6507.122688293457, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4708.105945587158, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6576.025466918945, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4785.804138183594, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6655.387382507324, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4762.782192230225, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5467.3956871032715, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3689.639949798584, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5539.336166381836, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3981.846866607666, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5637.489109039307, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4012.6147460937495, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5972.575836181641, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4119.997615814209, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 21874.033203125, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2898.939847946167, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 21101.767578125, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2996.2156677246094, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 21193.941802978516, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3030.837278366089, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 21327.536239624023, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} +{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3117.4169731140137, "config": {"BLOCK_SIZE_M": 256, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} From e324ca40ef4751d6d7e45c15858eccdcc3796e7c Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Fri, 18 Jul 2025 21:34:23 +0000 Subject: [PATCH 10/61] getting moe benchmark to run Signed-off-by: Burkhard Ringlein --- ibm-triton-lib/ibm_triton_lib/kernels/fused_moe.py | 12 ++++++++---- scripts/benchmark.py | 14 ++++++++++---- scripts/setups/granite4_moe_0.conf | 7 ++++--- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/fused_moe.py b/ibm-triton-lib/ibm_triton_lib/kernels/fused_moe.py index 2276a8713..ca12442a0 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/fused_moe.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/fused_moe.py @@ -685,7 +685,7 @@ def get_moe_configs( os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name) if os.path.exists(config_file_path): with open(config_file_path) as f: - logger.info("Using configuration from %s for MoE layer.", + print("Using configuration from %s for MoE layer.", config_file_path) # If a configuration has been found, return it return {int(key): val for key, val in json.load(f).items()} @@ -713,7 +713,7 @@ def get_moe_configs( os.path.dirname(os.path.realpath(__file__)), "configs", fallback_json_file_name) if os.path.exists(fallback_config_file_path): with open(fallback_config_file_path) as f: - logger.warning(("Config file not found at %s. Trying to use next" \ + print(("Config file not found at %s. Trying to use next" \ " best config at %s for MoE layer. Performance" " might still be sub-optimal."), config_file_path, fallback_config_file_path) @@ -721,7 +721,7 @@ def get_moe_configs( # If no optimized configuration is available (and heuristics is disabled), # we will use the default configuration - logger.warning( + print( ("Using default MoE config. Performance might be sub-optimal! " "Config file not found at %s"), config_file_path) return None @@ -1286,6 +1286,7 @@ def fused_experts_impl( a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None, + use_vllm_config=True, ) -> torch.Tensor: # Check constraints. 
if use_int4_w4a16: @@ -1499,6 +1500,7 @@ def fused_moe( a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None, + use_vllm_config=True, ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets of @@ -1586,7 +1588,9 @@ def fused_moe( w2_zp=w2_zp, a1_scale=a1_scale, a2_scale=a2_scale, - block_shape=block_shape) + block_shape=block_shape, + use_vllm_config=use_vllm_config, + ) # class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 8293e6b65..bb330cb68 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -68,6 +68,8 @@ class Implementation(Enum): UNF_TRITON_2D = 11 UNF_TRITON_AUTO = 12 PYTORCH_NATIVE = 13 + TRITON_TUNED = 14 + TRITON_VLLM = 15 class BenchmarkMode(Enum): @@ -1732,7 +1734,7 @@ def generate_dummy_data(batch_size): @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("max_value", MAX_VALUES) -# @pytest.mark.parametrize("implementation", IMPLEMENTATION_UT) +@pytest.mark.parametrize("implementation", IMPLEMENTATION_UT) @pytest.mark.parametrize("benchmark_mode", BENCHMARK_MODES) def test_fused_moe( capsys, @@ -1747,7 +1749,7 @@ def test_fused_moe( dtype: torch.dtype, seed, max_value, - # implementation, + implementation, benchmark_mode, ): # based on: https://github.com/vllm-project/vllm/blob/main/tests/kernels/test_moe.py @@ -1756,6 +1758,9 @@ def test_fused_moe( my_id = request.node.nodeid.split("::")[-1] my_name = my_id.split("[")[0] my_instance = my_id.split("[")[1][:-1] + + if implementation not in [Implementation.TRITON_TUNED, Implementation.TRITON_VLLM]: + pytest.skip() def torch_moe(a, w1, w2, score, topk): B, D = a.shape @@ -1826,9 +1831,10 @@ def torch_moe(a, w1, w2, score, topk): each token is repeated, and N is the output feature dimension. """ - # TODO: renormalize? + + use_vllm_config = True if implementation == Implementation.TRITON_VLLM else False triton_output = fused_moe(a, w1, w2, input_gating, topk, - renormalize=True) #inplace=True ? + renormalize=True, use_vllm_config=use_vllm_config) #inplace=True ? 
assert triton_output is not None captured = '' diff --git a/scripts/setups/granite4_moe_0.conf b/scripts/setups/granite4_moe_0.conf index 1103ff8d6..8afeb56ad 100644 --- a/scripts/setups/granite4_moe_0.conf +++ b/scripts/setups/granite4_moe_0.conf @@ -9,10 +9,11 @@ TP_FACTOR = [1, 2] # DTYPES = ["bfloat16"] DTYPES = ["float16"] -BENCHMARK_MODES = ["CUDA_EVENTS"] -# BENCHMARK_MODES = ["CUDA_GRAPS"] +# BENCHMARK_MODES = ["CUDA_EVENTS"] +BENCHMARK_MODES = ["CUDA_GRAPHS"] -IMPLEMENTATION_UT = ["BASELINE_TRITON"] # some value for now +# IMPLEMENTATION_UT = ["TRITON_TUNED", "TRITON_VLLM"] +IMPLEMENTATION_UT = ["TRITON_VLLM"] # TRITON_BACKEND_DEBUG = 1 # STORE_TEST_RESULT_PATH=/results From b7161b7bd7a53ae85b6f04fa6bb4a21dfa1570e9 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Mon, 21 Jul 2025 05:46:21 -0400 Subject: [PATCH 11/61] tuning remaining parts...log lost due to restart Signed-off-by: Burkhard Ringlein --- ...256,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++++++++++++++++++ ...512,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++++++++++++++++++ ...768,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++++++++++++++++++ vllm | 2 +- 4 files changed, 439 insertions(+), 1 deletion(-) create mode 100644 E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json b/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 000000000..147a83660 --- /dev/null +++ b/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json b/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 000000000..a01e9c317 --- /dev/null +++ b/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json b/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 
000000000..3caae02cb --- /dev/null +++ b/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/vllm b/vllm index 5be130601..8414a44a7 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit 5be1306019f7622b86c8f8aedfea477be83d4a21 +Subproject commit 8414a44a78b3718f0360b2ffa480e2e5210b5740 From 8c7acf4bb14d0d5a50b9ee9215676aceb436c05d Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Tue, 22 Jul 2025 10:14:54 -0400 Subject: [PATCH 12/61] benchmarking fused_moe --- ...256,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++++++++++++++++++ ...512,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++++++++++++++++++ ...384,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++++++++++++++++++ ...768,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++++++++++++++++++ .../ibm_triton_lib/kernels/fused_moe.py | 10 +- scripts/benchmark.py | 13 +- scripts/high_qps_bench.sh | 3 +- scripts/setups/granite4_moe_0.conf | 5 +- vllm | 2 +- 9 files changed, 603 insertions(+), 14 deletions(-) create mode 100644 
ibm-triton-lib/ibm_triton_lib/kernels/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 ibm-triton-lib/ibm_triton_lib/kernels/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 ibm-triton-lib/ibm_triton_lib/kernels/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 ibm-triton-lib/ibm_triton_lib/kernels/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json b/ibm-triton-lib/ibm_triton_lib/kernels/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 000000000..147a83660 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/configs/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json b/ibm-triton-lib/ibm_triton_lib/kernels/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json 
new file mode 100644 index 000000000..a01e9c317 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/configs/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json b/ibm-triton-lib/ibm_triton_lib/kernels/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 000000000..a7cfd175d --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/configs/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 
64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json b/ibm-triton-lib/ibm_triton_lib/kernels/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 000000000..3caae02cb --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + 
}, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/fused_moe.py b/ibm-triton-lib/ibm_triton_lib/kernels/fused_moe.py index ca12442a0..03230e1f6 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/fused_moe.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/fused_moe.py @@ -859,6 +859,7 @@ def try_get_optimal_moe_config( M: int, is_marlin: bool = False, block_shape: Optional[list[int]] = None, + force_default=False, ) -> dict[str, int]: from vllm.model_executor.layers.fused_moe import get_config override_config = get_config() @@ -873,7 +874,7 @@ def try_get_optimal_moe_config( block_k = block_shape[1] if block_shape else 0 configs = get_moe_configs(E, N, dtype, block_n, block_k) - if configs: + if configs and not force_default: # If an optimal configuration map has been found, look up the # optimal config config = configs[min(configs.keys(), key=lambda x: abs(x - M))] @@ -1286,7 +1287,7 @@ def fused_experts_impl( a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None, - use_vllm_config=True, + use_default_config = False, ) -> torch.Tensor: # Check constraints. 
if use_int4_w4a16: @@ -1336,6 +1337,7 @@ def fused_experts_impl( top_k_num, config_dtype, block_shape=block_shape, + force_default=use_default_config, ) config = get_config_func(M) @@ -1500,7 +1502,7 @@ def fused_moe( a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None, - use_vllm_config=True, + use_default_config=True, ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets of @@ -1589,7 +1591,7 @@ def fused_moe( a1_scale=a1_scale, a2_scale=a2_scale, block_shape=block_shape, - use_vllm_config=use_vllm_config, + use_default_config=use_default_config, ) diff --git a/scripts/benchmark.py b/scripts/benchmark.py index bb330cb68..b4619bb94 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -69,7 +69,7 @@ class Implementation(Enum): UNF_TRITON_AUTO = 12 PYTORCH_NATIVE = 13 TRITON_TUNED = 14 - TRITON_VLLM = 15 + TRITON_FALLBACK = 15 class BenchmarkMode(Enum): @@ -1759,7 +1759,7 @@ def test_fused_moe( my_name = my_id.split("[")[0] my_instance = my_id.split("[")[1][:-1] - if implementation not in [Implementation.TRITON_TUNED, Implementation.TRITON_VLLM]: + if implementation not in [Implementation.TRITON_TUNED, Implementation.TRITON_FALLBACK]: pytest.skip() def torch_moe(a, w1, w2, score, topk): @@ -1832,9 +1832,9 @@ def torch_moe(a, w1, w2, score, topk): """ - use_vllm_config = True if implementation == Implementation.TRITON_VLLM else False + use_default_config = True if implementation == Implementation.TRITON_FALLBACK else False triton_output = fused_moe(a, w1, w2, input_gating, topk, - renormalize=True, use_vllm_config=use_vllm_config) #inplace=True ? + renormalize=True, use_default_config=use_default_config) #inplace=True ? assert triton_output is not None captured = '' @@ -1852,7 +1852,8 @@ def torch_moe(a, w1, w2, score, topk): allclose_pass = True call_func_under_test = lambda: fused_moe(a, w1, w2, input_gating, topk, - renormalize=True, inplace=True) + renormalize=True, inplace=True, + use_default_config=use_default_config) # benchmark only correct results if do_benchmarks: @@ -1877,7 +1878,7 @@ def torch_moe(a, w1, w2, score, topk): "topk": topk, "max_value": max_value, "dtype": dtype, - # "implementation": implementation, + "implementation": implementation, "ms": ms, "min_ms": min_ms, "max_ms": max_ms, diff --git a/scripts/high_qps_bench.sh b/scripts/high_qps_bench.sh index 271b87d10..4812d8b5a 100755 --- a/scripts/high_qps_bench.sh +++ b/scripts/high_qps_bench.sh @@ -4,7 +4,8 @@ # MODEL=meta-llama/Llama-3.1-8B-Instruct # MODEL=/net/storage149/autofs/css22/nmg/models/hf/ibm-granite/granite-4.0-tiny-preview/main/ -MODEL=/net/storage149/autofs/css22/nmg/models/hf/ibm-ai-platform/Bamba-9B-v1/main/ +MODEL=/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf +# MODEL=/net/storage149/autofs/css22/nmg/models/hf/ibm-ai-platform/Bamba-9B-v1/main/ REQUEST_RATES=(20 20 20) TOTAL_SECONDS=120 diff --git a/scripts/setups/granite4_moe_0.conf b/scripts/setups/granite4_moe_0.conf index 8afeb56ad..69cf34aa9 100644 --- a/scripts/setups/granite4_moe_0.conf +++ b/scripts/setups/granite4_moe_0.conf @@ -5,6 +5,7 @@ SEQUENCE_LENGTHS = [16, 32, 64, 128, 512, 1024, 2048, 4096] MOE_N = [768] # intermediate size MOE_K = [4096] # hidden size MOE_TOP_K = [10] # num_experts_per_tok +MOE_NUM_EXPERTS = [72] TP_FACTOR = [1, 2] # DTYPES = ["bfloat16"] DTYPES = ["float16"] @@ -12,8 +13,8 @@ DTYPES = ["float16"] # BENCHMARK_MODES = 
["CUDA_EVENTS"] BENCHMARK_MODES = ["CUDA_GRAPHS"] -# IMPLEMENTATION_UT = ["TRITON_TUNED", "TRITON_VLLM"] -IMPLEMENTATION_UT = ["TRITON_VLLM"] +IMPLEMENTATION_UT = ["TRITON_TUNED", "TRITON_FALLBACK"] +# IMPLEMENTATION_UT = ["TRITON_FALLBACK"] # TRITON_BACKEND_DEBUG = 1 # STORE_TEST_RESULT_PATH=/results diff --git a/vllm b/vllm index 8414a44a7..aa0dc77ef 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit 8414a44a78b3718f0360b2ffa480e2e5210b5740 +Subproject commit aa0dc77ef53b365ddf54be51748c166895a0bcd9 From ba3ef68244a97c1fac8bee5c69affdca14fca705 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Wed, 23 Jul 2025 09:40:27 -0400 Subject: [PATCH 13/61] setup ttft sweeps Signed-off-by: Burkhard Ringlein --- scripts/bench_vllm_latency_range.py | 38 +++++++++++++++++++---------- scripts/setups/granite4_moe_0.conf | 6 +++-- vllm | 2 +- 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/scripts/bench_vllm_latency_range.py b/scripts/bench_vllm_latency_range.py index a12716ae5..4f3fe3a64 100644 --- a/scripts/bench_vllm_latency_range.py +++ b/scripts/bench_vllm_latency_range.py @@ -20,6 +20,7 @@ import sys import torch from datetime import datetime +from itertools import zip_longest, repeat, chain def create_dir_if_not_exist_recursive(path, mode=0o777): @@ -42,24 +43,31 @@ def create_dir_if_not_exist(path, mode=0o777): print(f"can't set permission of directory {path}: {e}") -if len(sys.argv) < 4: - print(f"Usage: {sys.argv[0]} ") +if len(sys.argv) < 5: + print(f"Usage: {sys.argv[0]} ") + exit(-1) selected_batch_sizes = [1] # [4, 16, 32] #,128] -selected_input_lengths = [500] # , 1000, 1500, 2000, 4000, 8000, 16000] -selected_output_lengths = [10, 100, 200, 400, 800, 1600, 3200, 6400, 12800] +# selected_input_lengths = [500] # , 1000, 1500, 2000, 4000, 8000, 16000] +# selected_output_lengths = [10, 100, 200, 400, 800, 1600, 3200, 6400, 12800] +selected_input_lengths = [64, 128, 512, 1024, 2048, 4096] +selected_output_lengths = [1] gpu_name = torch.cuda.get_device_name().replace(" ", "_").replace("/", "_") # model = "/model/llama3.1-8b/instruct/" model = sys.argv[1] -testcase_name = sys.argv[2] -result_path = os.path.abspath(sys.argv[3]) +tp = int(sys.argv[2]) +testcase_name = sys.argv[3] +result_path = os.path.abspath(sys.argv[4]) # max_rounds = 128 max_rounds = 64 max_num_prompts = 1000 +warmup_iterations = 3 +iterations = 3 + timestamp_f = datetime.now().strftime("%Y-%m-%d_%H%M") # result_dir = f"/results/{model.replace('/','-')}/{gpu_name}/{testcase_name}" @@ -75,14 +83,15 @@ def create_dir_if_not_exist(path, mode=0o777): print(f"can't find benchmark script benchmark_latency.py") exit(-1) -# Assisted by watsonx Code Assistant -from itertools import zip_longest - +max_length = max(len(selected_batch_sizes), len(selected_input_lengths), len(selected_output_lengths)) zipped_lists = list( zip_longest( - selected_batch_sizes, - selected_input_lengths, - selected_output_lengths, + chain(selected_batch_sizes, + repeat(selected_batch_sizes[-1], times=max_length-len(selected_batch_sizes))), + chain(selected_input_lengths, + repeat(selected_input_lengths[-1], times=max_length-len(selected_input_lengths))), + chain(selected_output_lengths, + repeat(selected_output_lengths[-1], times=max_length-len(selected_output_lengths))), fillvalue=None, ) ) @@ -99,7 +108,10 @@ def create_dir_if_not_exist(path, mode=0o777): f"VLLM_USE_V1=1 python {bench_script} " f"--model {model} " f"--input-len {il} --output-len {ol} --batch-size {bs} " - f"--output-json {json_file_name}" + f"--output-json 
{json_file_name} " + f"--num-iters-warmup {warmup_iterations} " + f"--num-iters {iterations} " + f"--tensor-parallel {tp} " ) print(cmd) rv = os.system(cmd) diff --git a/scripts/setups/granite4_moe_0.conf b/scripts/setups/granite4_moe_0.conf index 69cf34aa9..0792807f2 100644 --- a/scripts/setups/granite4_moe_0.conf +++ b/scripts/setups/granite4_moe_0.conf @@ -2,10 +2,12 @@ BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64, 128] # BATCH_SIZES = [4] SEQUENCE_LENGTHS = [16, 32, 64, 128, 512, 1024, 2048, 4096] -MOE_N = [768] # intermediate size +# MOE_N = [768] # intermediate size, g4s +MOE_N = [512] # g4t MOE_K = [4096] # hidden size MOE_TOP_K = [10] # num_experts_per_tok -MOE_NUM_EXPERTS = [72] +# MOE_NUM_EXPERTS = [72] #g4s +MOE_NUM_EXPERTS = [62] #g4t TP_FACTOR = [1, 2] # DTYPES = ["bfloat16"] DTYPES = ["float16"] diff --git a/vllm b/vllm index aa0dc77ef..84c75250a 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit aa0dc77ef53b365ddf54be51748c166895a0bcd9 +Subproject commit 84c75250a98b55204db920aa254ab38e6b820d1c From 2e129b87f468a0cdf08b8bb7ec70c9e3ccdcf891 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Thu, 24 Jul 2025 03:14:29 -0400 Subject: [PATCH 14/61] first tuning night Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 26 ++++++++++++++++++ .../default/cache.json | 25 +++++++++++++++++ .../default/cache.json | 27 +++++++++++++++++++ .../default/cache.json | 26 ++++++++++++++++++ .../default/cache.json | 26 ++++++++++++++++++ .../default/cache.json | 8 ++++++ .../default/cache.json | 24 +++++++++++++++++ .../ibm_triton_lib/kernels/mamba_ssm.py | 2 +- vllm | 2 +- 9 files changed, 164 insertions(+), 2 deletions(-) create mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json create mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_cumsum_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default/cache.json create mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json create mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json create mode 100755 
g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json create mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json create mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default/cache.json diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json new file mode 100755 index 000000000..0312c1d6e --- /dev/null +++ b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json @@ -0,0 +1,26 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_bmm:_bmm_chunk_fwd_kernel)", + "total_bench_time_s": 10756.567904472351, + "evaluated_configs": 2625, + "keys": [ + "chunk_size", + "K", + "IS_CAUSAL" + ], + "cache": { + "('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 32, BLOCK_SIZE_K: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": [ + 0.002230335958302021 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": 
true + } +} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_cumsum_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_cumsum_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default/cache.json new file mode 100755 index 000000000..e6e0dc8a0 --- /dev/null +++ b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_cumsum_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default/cache.json @@ -0,0 +1,25 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_cumsum_fwd_kernel)", + "total_bench_time_s": 7.361271619796753, + "evaluated_configs": 7, + "keys": [ + "chunk_size", + "nheads" + ], + "cache": { + "('256', '128', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": "BLOCK_SIZE_H: 2, num_warps: 4, num_ctas: 1, num_stages: 3, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('256', '128', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": [ + 0.002133406000211835 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } +} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json new file mode 100755 index 000000000..dd9c29f78 --- /dev/null +++ 
b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json @@ -0,0 +1,27 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_scan:_chunk_scan_fwd_kernel)", + "total_bench_time_s": 15278.822125434875, + "evaluated_configs": 2625, + "keys": [ + "chunk_size", + "hdim", + "dstate", + "IS_CAUSAL" + ], + "cache": { + "('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": "BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 16, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": [ + 0.014237518422305584 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } +} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json new file mode 100755 index 000000000..68505b261 --- /dev/null +++ b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json @@ -0,0 +1,26 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_fwd_kernel)", + "total_bench_time_s": 9348.028031349182, + "evaluated_configs": 2625, + "keys": [ + "hdim", + "dstate", + "chunk_size" + ], + "cache": { + "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 64, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('64', '128', '256', 'torch.bfloat16', 
'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": [ + 0.003924777265638113 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } +} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json new file mode 100755 index 000000000..06f0a4220 --- /dev/null +++ b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json @@ -0,0 +1,26 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_varlen_kernel)", + "total_bench_time_s": 19485.390374183655, + "evaluated_configs": 2625, + "keys": [ + "hdim", + "dstate", + "chunk_size" + ], + "cache": { + "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16')": "BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 16, BLOCK_SIZE_K: 16, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16')": [ + NaN + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } +} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json new file mode 100755 index 
000000000..550944b2a --- /dev/null +++ b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.mamba_ssm:_selective_scan_update_kernel)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default/cache.json new file mode 100755 index 000000000..58f89f93d --- /dev/null +++ b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default/cache.json @@ -0,0 +1,24 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_state_passing:_state_passing_fwd_kernel)", + "total_bench_time_s": 275.2601103782654, + "evaluated_configs": 168, + "keys": [ + "dim" + ], + "cache": { + "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32')": "BLOCK_SIZE: 512, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32')": [ + 0.0030820679385215044 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/mamba_ssm.py b/ibm-triton-lib/ibm_triton_lib/kernels/mamba_ssm.py index 79e519b2d..0bcbfdfb5 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/mamba_ssm.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/mamba_ssm.py @@ -85,7 +85,7 @@ def fallback_heuristic_simple(key): config_space=triton_dejavu.ConfigSpace( {"BLOCK_SIZE_M": [4, 8, 16, 32, 64]}, num_warps=[2, 4, 8], - num_stages=[1, 2, 4, 6, 8], + num_stages=[1, 2, 3, 4, 5, 6, 8], ), key=[ "dstate", diff --git a/vllm b/vllm index 84c75250a..1f6be7ff0 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject 
commit 84c75250a98b55204db920aa254ab38e6b820d1c +Subproject commit 1f6be7ff01b67e2551e3b6b9b7a8933dc0553bd8 From b37a42032981fbe7ef34e4bd3fa1caf8c8c4e380 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Thu, 24 Jul 2025 03:22:37 -0400 Subject: [PATCH 15/61] adding tune log for analysis Signed-off-by: Burkhard Ringlein --- tuning_0.log | 113327 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 113327 insertions(+) create mode 100644 tuning_0.log diff --git a/tuning_0.log b/tuning_0.log new file mode 100644 index 000000000..9737a859d --- /dev/null +++ b/tuning_0.log @@ -0,0 +1,113327 @@ +INFO 07-23 11:40:53 [__init__.py:235] Automatically detected platform cuda. +Namespace(input_len=64, output_len=1, batch_size=1, n=1, use_beam_search=False, num_iters_warmup=3, num_iters=3, profile=False, output_json='/home/zrlngl/watsonx/zrl-triton-results-and-notebooks/vllm_benchmarks_latency/-net-storage149-autofs-css22-nmg-models-cos-1bfc857-fmaas-integration-tests-models-granite-4_0-small-base-pipecleaner-hf/NVIDIA_H100_80GB_HBM3/tuning_ignore/exp_2025-07-23_1140//result_bs_1_il_64_ol_1.json', disable_detokenize=False, model='/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf', task='auto', tokenizer=None, tokenizer_mode='auto', trust_remote_code=False, dtype='auto', seed=None, hf_config_path=None, allowed_local_media_path='', revision=None, code_revision=None, rope_scaling={}, rope_theta=None, tokenizer_revision=None, max_model_len=None, quantization=None, enforce_eager=False, max_seq_len_to_capture=8192, max_logprobs=20, logprobs_mode='raw_logprobs', disable_sliding_window=False, disable_cascade_attn=False, skip_tokenizer_init=False, enable_prompt_embeds=False, served_model_name=None, disable_async_output_proc=False, config_format='auto', hf_token=None, hf_overrides={}, override_neuron_config={}, override_pooler_config=None, logits_processor_pattern=None, generation_config='auto', override_generation_config={}, enable_sleep_mode=False, model_impl='auto', override_attention_dtype=None, load_format='auto', download_dir=None, model_loader_extra_config={}, ignore_patterns=None, use_tqdm_on_load=True, pt_load_map_location='cpu', guided_decoding_backend='auto', guided_decoding_disable_fallback=False, guided_decoding_disable_any_whitespace=False, guided_decoding_disable_additional_properties=False, reasoning_parser='', distributed_executor_backend=None, pipeline_parallel_size=1, tensor_parallel_size=1, data_parallel_size=1, data_parallel_rank=None, data_parallel_size_local=None, data_parallel_address=None, data_parallel_rpc_port=None, data_parallel_backend='mp', enable_expert_parallel=False, enable_eplb=False, num_redundant_experts=0, eplb_window_size=1000, eplb_step_interval=3000, eplb_log_balancedness=False, max_parallel_loading_workers=None, ray_workers_use_nsight=False, disable_custom_all_reduce=False, worker_cls='auto', worker_extension_cls='', enable_multimodal_encoder_data_parallel=False, block_size=None, gpu_memory_utilization=0.9, swap_space=4, kv_cache_dtype='auto', num_gpu_blocks_override=None, enable_prefix_caching=False, prefix_caching_hash_algo='builtin', cpu_offload_gb=0, calculate_kv_scales=False, limit_mm_per_prompt={}, media_io_kwargs={}, mm_processor_kwargs=None, disable_mm_preprocessor_cache=False, interleave_mm_strings=False, enable_lora=None, enable_lora_bias=False, max_loras=1, max_lora_rank=16, lora_extra_vocab_size=256, lora_dtype='auto', max_cpu_loras=None, fully_sharded_loras=False, 
default_mm_loras=None, enable_prompt_adapter=None, max_prompt_adapters=1, max_prompt_adapter_token=0, speculative_config=None, show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, max_num_batched_tokens=None, max_num_seqs=None, max_num_partial_prefills=1, max_long_partial_prefills=1, cuda_graph_sizes=[], long_prefill_token_threshold=0, num_lookahead_slots=0, scheduler_delay_factor=0.0, preemption_mode=None, num_scheduler_steps=1, multi_step_stream_outputs=True, scheduling_policy='fcfs', enable_chunked_prefill=None, disable_chunked_mm_input=False, scheduler_cls='vllm.core.scheduler.Scheduler', disable_hybrid_kv_cache_manager=False, async_scheduling=False, kv_transfer_config=None, kv_events_config=None, compilation_config={"level":0,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":[],"use_inductor":true,"compile_sizes":null,"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"use_cudagraph":true,"cudagraph_num_of_warmups":0,"cudagraph_capture_sizes":null,"cudagraph_copy_inputs":false,"full_cuda_graph":false,"max_capture_size":null,"local_cache_dir":null}, additional_config={}, disable_log_stats=False) +ERROR 07-23 11:41:01 [config.py:133] Error retrieving safetensors: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf'. Use `repo_type` argument if needed., retrying 1 of 2 +ERROR 07-23 11:41:03 [config.py:131] Error retrieving safetensors: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf'. Use `repo_type` argument if needed. +INFO 07-23 11:41:03 [config.py:3483] Downcasting torch.float32 to torch.bfloat16. +INFO 07-23 11:41:03 [config.py:1602] Using max model len 132096 +WARNING 07-23 11:41:03 [arg_utils.py:1684] Detected VLLM_USE_V1=1 with Mamba. Usage should be considered experimental. Please report any issues on Github. +INFO 07-23 11:41:03 [config.py:2424] Chunked prefill is enabled with max_num_batched_tokens=16384. +INFO 07-23 11:41:03 [config.py:214] Setting max_seq_len_to_capture to 132096 to ensure that CUDA graph capture covers sequences of length up to max_model_len. +[triton-dejavu] generated 75 configurations out of ConfigSpace: BLOCK_SIZE_M: [4, 8, 16, 32, 64], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 4, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. +[triton-dejavu] restored 0 configurations for _selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default. +[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. 
+[triton-dejavu] restored 0 configurations for _bmm_chunk_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. +[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. +[triton-dejavu] restored 0 configurations for _chunk_scan_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. +[triton-dejavu] restored 1 configurations for _chunk_cumsum_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default. +[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. +[triton-dejavu] restored 0 configurations for _chunk_state_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. +[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. +[triton-dejavu] restored 0 configurations for _chunk_state_varlen_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. +[triton-dejavu] generated 168 configurations out of ConfigSpace: BLOCK_SIZE: [32, 64, 128, 256, 512, 1024, 2048, 4096], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. 
+[triton-dejavu] restored 0 configurations for _state_passing_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default. +INFO 07-23 11:41:05 [config.py:279] Setting attention block size to 528 tokens to ensure that attention page size is >= mamba page size. +INFO 07-23 11:41:05 [config.py:300] Padding mamba page size by 0.69% to ensure that mamba page size and attention page size are exactly equal. +WARNING 07-23 11:41:05 [__init__.py:2904] We must use the `spawn` multiprocessing start method. Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing for more information. Reason: CUDA is initialized +INFO 07-23 11:41:09 [__init__.py:235] Automatically detected platform cuda. +INFO 07-23 11:41:10 [core.py:553] Waiting for init message from front-end. +INFO 07-23 11:41:10 [core.py:71] Initializing a V1 LLM engine (v0.1.dev7919+g84c7525) with config: model='/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf', speculative_config=None, tokenizer='/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=132096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"max_capture_size":512,"local_cache_dir":null} +[triton-dejavu] generated 75 configurations out of ConfigSpace: BLOCK_SIZE_M: [4, 8, 16, 32, 64], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 4, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. 
+[triton-dejavu] restored 0 configurations for _selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default. +[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. +[triton-dejavu] restored 0 configurations for _bmm_chunk_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. +[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. +[triton-dejavu] restored 0 configurations for _chunk_scan_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. +[triton-dejavu] restored 1 configurations for _chunk_cumsum_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default. +[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. +[triton-dejavu] restored 0 configurations for _chunk_state_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. +[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. 
+[triton-dejavu] restored 0 configurations for _chunk_state_varlen_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. +[triton-dejavu] generated 168 configurations out of ConfigSpace: BLOCK_SIZE: [32, 64, 128, 256, 512, 1024, 2048, 4096], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. +[triton-dejavu] restored 0 configurations for _state_passing_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default. +INFO 07-23 11:41:12 [parallel_state.py:1102] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +INFO 07-23 11:41:12 [topk_topp_sampler.py:49] Using FlashInfer for top-p & top-k sampling. +INFO 07-23 11:41:12 [gpu_model_runner.py:1793] Starting to load model /net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf... +INFO 07-23 11:41:12 [gpu_model_runner.py:1826] Loading model from scratch... +INFO 07-23 11:41:12 [cuda.py:246] Using FlashInfer backend on V1 engine. +INFO 07-23 11:41:24 [default_loader.py:262] Loading weights took 11.59 seconds +INFO 07-23 11:41:24 [gpu_model_runner.py:1850] Model loading took 60.0260 GiB and 11.687507 seconds +INFO 07-23 11:41:27 [backends.py:530] Using cache directory: /home/zrlngl/.cache/vllm/torch_compile_cache/9bcd1b9f98/rank_0_0/backbone for vLLM's torch.compile +INFO 07-23 11:41:27 [backends.py:541] Dynamo bytecode transform time: 2.60 s +INFO 07-23 11:41:29 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 1.788 s +INFO 07-23 11:41:29 [fused_moe.py:688] Using configuration from /home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json for MoE layer. +INFO 07-23 11:41:29 [monitor.py:34] torch.compile takes 2.60 s in total +INFO 07-23 11:41:30 [gpu_worker.py:245] Available KV cache memory: 8.99 GiB +INFO 07-23 11:41:31 [kv_cache_utils.py:997] GPU KV cache size: 58,608 tokens +INFO 07-23 11:41:31 [kv_cache_utils.py:1001] Maximum concurrency for 132,096 tokens per request: 4.29x +INFO 07-23 11:41:51 [gpu_model_runner.py:2395] Graph capturing finished in 20 secs, took 0.93 GiB +INFO 07-23 11:41:51 [core.py:193] init engine (profile, create kv cache, warmup model) took 26.97 seconds +INFO 07-23 11:41:51 [config.py:214] Setting max_seq_len_to_capture to 132096 to ensure that CUDA graph capture covers sequences of length up to max_model_len. 
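[Note, not part of the tuning log] The configuration counts triton-dejavu reports above (75, 2625, and 168) are simply the Cartesian product of the option lists in each ConfigSpace. A minimal illustrative sketch (the helper name config_space_size is made up for this note) that reproduces the three counts:

from itertools import product

def config_space_size(**axes):
    # number of configurations = product of the option counts per axis
    return len(list(product(*axes.values())))

# _selective_scan_update_kernel: 5 BLOCK_SIZE_M x 5 num_stages x 3 num_warps = 75
print(config_space_size(BLOCK_SIZE_M=[4, 8, 16, 32, 64],
                        num_stages=[1, 2, 4, 6, 8],
                        num_warps=[2, 4, 8]))

# chunk/bmm kernels: 5 x 5 x 5 block sizes x 7 num_stages x 3 num_warps = 2625
print(config_space_size(BLOCK_SIZE_M=[16, 32, 64, 128, 256],
                        BLOCK_SIZE_N=[16, 32, 64, 128, 256],
                        BLOCK_SIZE_K=[16, 32, 64, 128, 256],
                        num_stages=[1, 2, 3, 4, 5, 6, 8],
                        num_warps=[2, 4, 8]))

# _state_passing_fwd_kernel: 8 BLOCK_SIZE x 7 num_stages x 3 num_warps = 168
print(config_space_size(BLOCK_SIZE=[32, 64, 128, 256, 512, 1024, 2048, 4096],
                        num_stages=[1, 2, 3, 4, 5, 6, 8],
                        num_warps=[2, 4, 8]))

The axes that carry only a single option in the log (num_ctas, maxnreg, the warp-specialization fields, and so on) do not change the product.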
+SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=True, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None, extra_args=None) +Warming up... +[triton-dejavu] ('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32') not in cache, starting to tune... +[triton-dejavu] [2025-07-23 11:41:51] Started benchmarking of 2625 configurations... (use_bo: False, run: 0) +[triton-dejavu] First execution including JIT compilation took 0.003792285919189453s. +[triton-dejavu] First execution including JIT compilation took 0.0024290084838867188s. +[triton-dejavu] First execution including JIT compilation took 0.002718687057495117s. +[triton-dejavu] First execution including JIT compilation took 0.0024924278259277344s. +[triton-dejavu] First execution including JIT compilation took 0.002490520477294922s. +[triton-dejavu] First execution including JIT compilation took 0.002441883087158203s. +[triton-dejavu] First execution including JIT compilation took 0.00249481201171875s. +[triton-dejavu] First execution including JIT compilation took 0.0025339126586914062s. +[triton-dejavu] First execution including JIT compilation took 0.0025014877319335938s. +[triton-dejavu] First execution including JIT compilation took 0.0037887096405029297s. +[triton-dejavu] First execution including JIT compilation took 0.0025954246520996094s. +[triton-dejavu] First execution including JIT compilation took 0.026740074157714844s. +[triton-dejavu] First execution including JIT compilation took 0.0025556087493896484s. +[triton-dejavu] First execution including JIT compilation took 0.002724885940551758s. +[triton-dejavu] First execution including JIT compilation took 0.002470731735229492s. +[triton-dejavu] First execution including JIT compilation took 0.0026400089263916016s. +[triton-dejavu] First execution including JIT compilation took 0.00249481201171875s. +[triton-dejavu] First execution including JIT compilation took 0.0030193328857421875s. +[triton-dejavu] First execution including JIT compilation took 0.0026063919067382812s. +[triton-dejavu] First execution including JIT compilation took 0.002532482147216797s. +[triton-dejavu] First execution including JIT compilation took 0.0024771690368652344s. +[triton-dejavu] First execution including JIT compilation took 0.0025453567504882812s. +[triton-dejavu] First execution including JIT compilation took 0.0025365352630615234s. +[triton-dejavu] First execution including JIT compilation took 0.002547740936279297s. +[triton-dejavu] First execution including JIT compilation took 0.0024492740631103516s. +[triton-dejavu] First execution including JIT compilation took 0.002447366714477539s. +[triton-dejavu] First execution including JIT compilation took 0.002489328384399414s. +[triton-dejavu] First execution including JIT compilation took 0.0026972293853759766s. +[triton-dejavu] First execution including JIT compilation took 0.002499103546142578s. +[triton-dejavu] First execution including JIT compilation took 0.002460479736328125s. +[triton-dejavu] First execution including JIT compilation took 0.0026140213012695312s. +[triton-dejavu] First execution including JIT compilation took 0.0025260448455810547s. 
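[Note, not part of the tuning log] The results of a run like this are what end up in the g4_tuning_data/.../cache.json files added earlier in this series: the directory path encodes hashes of the autotune config, kernel code version, tune features, and config space (so an entry is only restored when all four match), and the JSON itself stores the kernel signature, total benchmark time, number of evaluated configs, and a cache/timings mapping from the key tuple (shapes and dtypes) to the winning Triton config. A small illustrative sketch of reading one of these files (path shortened; field names follow the JSON shown in the earlier hunk):

import json

# path abbreviated here; the real one spells out the four hash components
with open(".../_state_passing_fwd_kernel/.../default/cache.json") as f:
    cache = json.load(f)

print(cache["signature"])           # JITFunction(...:_state_passing_fwd_kernel)
print(cache["evaluated_configs"])   # e.g. 168
print(cache["total_bench_time_s"])  # e.g. ~275 s
for key, cfg in cache["cache"].items():
    # key = tuple of tune features (dim, dtypes, ...), cfg = best config found
    print(key, "->", cfg, cache["timings"][key])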
+[triton-dejavu] First execution including JIT compilation took 0.002526998519897461s. +[triton-dejavu] First execution including JIT compilation took 0.002600431442260742s. +[triton-dejavu] First execution including JIT compilation took 0.002473115921020508s. +[triton-dejavu] First execution including JIT compilation took 0.0025148391723632812s. +[triton-dejavu] First execution including JIT compilation took 0.002527475357055664s. +[triton-dejavu] First execution including JIT compilation took 0.0025048255920410156s. +[triton-dejavu] First execution including JIT compilation took 0.0025298595428466797s. +[triton-dejavu] First execution including JIT compilation took 0.0027625560760498047s. +[triton-dejavu] First execution including JIT compilation took 0.0024726390838623047s. +[triton-dejavu] First execution including JIT compilation took 0.02015542984008789s. +[triton-dejavu] First execution including JIT compilation took 0.0024602413177490234s. +[triton-dejavu] First execution including JIT compilation took 0.00244903564453125s. +[triton-dejavu] First execution including JIT compilation took 0.0025997161865234375s. +[triton-dejavu] First execution including JIT compilation took 0.002533435821533203s. +[triton-dejavu] First execution including JIT compilation took 0.0026590824127197266s. +[triton-dejavu] First execution including JIT compilation took 0.002520322799682617s. +[triton-dejavu] First execution including JIT compilation took 0.0027184486389160156s. +[triton-dejavu] First execution including JIT compilation took 0.002468109130859375s. +[triton-dejavu] First execution including JIT compilation took 0.002460479736328125s. +[triton-dejavu] First execution including JIT compilation took 0.002536773681640625s. +[triton-dejavu] First execution including JIT compilation took 0.0024209022521972656s. +[triton-dejavu] First execution including JIT compilation took 0.0024750232696533203s. +[triton-dejavu] First execution including JIT compilation took 0.002548694610595703s. +[triton-dejavu] First execution including JIT compilation took 0.002722024917602539s. +[triton-dejavu] First execution including JIT compilation took 0.0025467872619628906s. +[triton-dejavu] First execution including JIT compilation took 0.0026938915252685547s. +[triton-dejavu] First execution including JIT compilation took 0.002567768096923828s. +[triton-dejavu] First execution including JIT compilation took 0.0025353431701660156s. +[triton-dejavu] First execution including JIT compilation took 0.36343860626220703s. +[triton-dejavu] First execution including JIT compilation took 0.2895364761352539s. +[triton-dejavu] First execution including JIT compilation took 0.28336596488952637s. +[triton-dejavu] First execution including JIT compilation took 0.24683427810668945s. +[triton-dejavu] First execution including JIT compilation took 0.23354220390319824s. +[triton-dejavu] First execution including JIT compilation took 0.2026069164276123s. +[triton-dejavu] First execution including JIT compilation took 0.361889123916626s. +[triton-dejavu] First execution including JIT compilation took 0.32724452018737793s. +[triton-dejavu] First execution including JIT compilation took 0.2289412021636963s. +[triton-dejavu] First execution including JIT compilation took 0.3778233528137207s. +[triton-dejavu] First execution including JIT compilation took 0.35303163528442383s. +[triton-dejavu] First execution including JIT compilation took 0.25978708267211914s. 
+[triton-dejavu] First execution including JIT compilation took 0.4000725746154785s. +[triton-dejavu] First execution including JIT compilation took 0.37045931816101074s. +[triton-dejavu] First execution including JIT compilation took 0.2568087577819824s. +[triton-dejavu] First execution including JIT compilation took 0.4205307960510254s. +[triton-dejavu] First execution including JIT compilation took 0.3958923816680908s. +[triton-dejavu] First execution including JIT compilation took 0.27231621742248535s. +[triton-dejavu] First execution including JIT compilation took 0.4481041431427002s. +[triton-dejavu] First execution including JIT compilation took 0.34272170066833496s. +[triton-dejavu] First execution including JIT compilation took 0.23339176177978516s. +[triton-dejavu] First execution including JIT compilation took 0.39439821243286133s. +[triton-dejavu] First execution including JIT compilation took 0.3556709289550781s. +[triton-dejavu] First execution including JIT compilation took 0.2575538158416748s. +[triton-dejavu] First execution including JIT compilation took 0.3856210708618164s. +[triton-dejavu] First execution including JIT compilation took 0.29673099517822266s. +[triton-dejavu] First execution including JIT compilation took 0.2755117416381836s. +[triton-dejavu] First execution including JIT compilation took 0.5427765846252441s. +[triton-dejavu] First execution including JIT compilation took 0.4995570182800293s. +[triton-dejavu] First execution including JIT compilation took 0.4783041477203369s. +[triton-dejavu] First execution including JIT compilation took 0.631375789642334s. +[triton-dejavu] First execution including JIT compilation took 0.5344967842102051s. +[triton-dejavu] First execution including JIT compilation took 0.4600377082824707s. +[triton-dejavu] First execution including JIT compilation took 0.4879109859466553s. +[triton-dejavu] First execution including JIT compilation took 0.47121238708496094s. +[triton-dejavu] First execution including JIT compilation took 0.41219019889831543s. +[triton-dejavu] First execution including JIT compilation took 0.6414506435394287s. +[triton-dejavu] First execution including JIT compilation took 0.5922412872314453s. +[triton-dejavu] First execution including JIT compilation took 0.5718593597412109s. +[triton-dejavu] First execution including JIT compilation took 0.7388913631439209s. +[triton-dejavu] First execution including JIT compilation took 0.6281144618988037s. +[triton-dejavu] First execution including JIT compilation took 0.5711205005645752s. +[triton-dejavu] First execution including JIT compilation took 0.9001131057739258s. +[triton-dejavu] First execution including JIT compilation took 0.681952953338623s. +[triton-dejavu] First execution including JIT compilation took 0.628960132598877s. +[triton-dejavu] First execution including JIT compilation took 0.19681072235107422s. +[triton-dejavu] First execution including JIT compilation took 0.1845228672027588s. +[triton-dejavu] First execution including JIT compilation took 0.18219757080078125s. +[triton-dejavu] First execution including JIT compilation took 0.218735933303833s. +[triton-dejavu] First execution including JIT compilation took 0.20905685424804688s. +[triton-dejavu] First execution including JIT compilation took 0.21239614486694336s. +[triton-dejavu] First execution including JIT compilation took 0.2337355613708496s. +[triton-dejavu] First execution including JIT compilation took 0.22266483306884766s. 
+[triton-dejavu] First execution including JIT compilation took 0.21049857139587402s. +[triton-dejavu] First execution including JIT compilation took 0.2469940185546875s. +[triton-dejavu] First execution including JIT compilation took 0.24105095863342285s. +[triton-dejavu] First execution including JIT compilation took 0.22619390487670898s. +[triton-dejavu] First execution including JIT compilation took 0.25737953186035156s. +[triton-dejavu] First execution including JIT compilation took 0.24932122230529785s. +[triton-dejavu] First execution including JIT compilation took 0.2292931079864502s. +[triton-dejavu] First execution including JIT compilation took 0.2662630081176758s. +[triton-dejavu] First execution including JIT compilation took 0.25505638122558594s. +[triton-dejavu] First execution including JIT compilation took 0.23747634887695312s. +[triton-dejavu] First execution including JIT compilation took 0.2888965606689453s. +[triton-dejavu] First execution including JIT compilation took 0.27660059928894043s. +[triton-dejavu] First execution including JIT compilation took 0.2541189193725586s. +[triton-dejavu] First execution including JIT compilation took 0.21480035781860352s. +[triton-dejavu] First execution including JIT compilation took 0.1914529800415039s. +[triton-dejavu] First execution including JIT compilation took 0.18795394897460938s. +[triton-dejavu] First execution including JIT compilation took 0.24895811080932617s. +[triton-dejavu] First execution including JIT compilation took 0.216827392578125s. +[triton-dejavu] First execution including JIT compilation took 0.21476054191589355s. +[triton-dejavu] First execution including JIT compilation took 0.26205873489379883s. +[triton-dejavu] First execution including JIT compilation took 0.23574447631835938s. +[triton-dejavu] First execution including JIT compilation took 0.22771525382995605s. +[triton-dejavu] First execution including JIT compilation took 0.2744171619415283s. +[triton-dejavu] First execution including JIT compilation took 0.24593615531921387s. +[triton-dejavu] First execution including JIT compilation took 0.25572872161865234s. +[triton-dejavu] First execution including JIT compilation took 0.2906627655029297s. +[triton-dejavu] First execution including JIT compilation took 0.2578747272491455s. +[triton-dejavu] First execution including JIT compilation took 0.2551157474517822s. +[triton-dejavu] First execution including JIT compilation took 0.307572603225708s. +[triton-dejavu] First execution including JIT compilation took 0.2733023166656494s. +[triton-dejavu] First execution including JIT compilation took 0.2657465934753418s. +[triton-dejavu] First execution including JIT compilation took 0.34859251976013184s. +[triton-dejavu] First execution including JIT compilation took 0.2858898639678955s. +[triton-dejavu] First execution including JIT compilation took 0.2817537784576416s. +[triton-dejavu] First execution including JIT compilation took 0.24785876274108887s. +[triton-dejavu] First execution including JIT compilation took 0.214003324508667s. +[triton-dejavu] First execution including JIT compilation took 0.20161771774291992s. +[triton-dejavu] First execution including JIT compilation took 0.30726170539855957s. +[triton-dejavu] First execution including JIT compilation took 0.2544825077056885s. +[triton-dejavu] First execution including JIT compilation took 0.22570061683654785s. +[triton-dejavu] First execution including JIT compilation took 0.3320579528808594s. 
+[triton-dejavu] First execution including JIT compilation took 0.2685830593109131s. +[triton-dejavu] First execution including JIT compilation took 0.23553252220153809s. +[triton-dejavu] First execution including JIT compilation took 0.34238600730895996s. +[triton-dejavu] First execution including JIT compilation took 0.2860074043273926s. +[triton-dejavu] First execution including JIT compilation took 0.24680185317993164s. +[triton-dejavu] First execution including JIT compilation took 0.3659553527832031s. +[triton-dejavu] First execution including JIT compilation took 0.2950880527496338s. +[triton-dejavu] First execution including JIT compilation took 0.2600231170654297s. +[triton-dejavu] First execution including JIT compilation took 0.38948678970336914s. +[triton-dejavu] First execution including JIT compilation took 0.3203599452972412s. +[triton-dejavu] First execution including JIT compilation took 0.2689199447631836s. +[triton-dejavu] First execution including JIT compilation took 0.42819809913635254s. +[triton-dejavu] First execution including JIT compilation took 0.3495504856109619s. +[triton-dejavu] First execution including JIT compilation took 0.2916533946990967s. +[triton-dejavu] First execution including JIT compilation took 0.3085203170776367s. +[triton-dejavu] First execution including JIT compilation took 0.26044535636901855s. +[triton-dejavu] First execution including JIT compilation took 0.24263620376586914s. +[triton-dejavu] First execution including JIT compilation took 0.4157595634460449s. +[triton-dejavu] First execution including JIT compilation took 0.358691930770874s. +[triton-dejavu] First execution including JIT compilation took 0.2635207176208496s. +[triton-dejavu] First execution including JIT compilation took 0.4522533416748047s. +[triton-dejavu] First execution including JIT compilation took 0.39212489128112793s. +[triton-dejavu] First execution including JIT compilation took 0.27906370162963867s. +[triton-dejavu] First execution including JIT compilation took 0.5023193359375s. +[triton-dejavu] First execution including JIT compilation took 0.4139993190765381s. +[triton-dejavu] First execution including JIT compilation took 0.300579309463501s. +[triton-dejavu] First execution including JIT compilation took 0.529712438583374s. +[triton-dejavu] First execution including JIT compilation took 0.43097352981567383s. +[triton-dejavu] First execution including JIT compilation took 0.3235909938812256s. +[triton-dejavu] First execution including JIT compilation took 0.5673091411590576s. +[triton-dejavu] First execution including JIT compilation took 0.4577775001525879s. +[triton-dejavu] First execution including JIT compilation took 0.34275031089782715s. +[triton-dejavu] First execution including JIT compilation took 0.6338176727294922s. +[triton-dejavu] First execution including JIT compilation took 0.49797868728637695s. +[triton-dejavu] First execution including JIT compilation took 0.37319207191467285s. +[triton-dejavu] First execution including JIT compilation took 0.47208404541015625s. +[triton-dejavu] First execution including JIT compilation took 0.33609509468078613s. +[triton-dejavu] First execution including JIT compilation took 0.30238795280456543s. +[triton-dejavu] First execution including JIT compilation took 0.7173871994018555s. +[triton-dejavu] First execution including JIT compilation took 0.5442206859588623s. +[triton-dejavu] First execution including JIT compilation took 0.511505126953125s. 
+[triton-dejavu] First execution including JIT compilation took 0.8165128231048584s. +[triton-dejavu] First execution including JIT compilation took 0.583181619644165s. +[triton-dejavu] First execution including JIT compilation took 0.5342910289764404s. +[triton-dejavu] First execution including JIT compilation took 0.9290053844451904s. +[triton-dejavu] First execution including JIT compilation took 0.6662936210632324s. +[triton-dejavu] First execution including JIT compilation took 0.5730347633361816s. +[triton-dejavu] First execution including JIT compilation took 0.9860448837280273s. +[triton-dejavu] First execution including JIT compilation took 0.7075350284576416s. +[triton-dejavu] First execution including JIT compilation took 0.593177080154419s. +[triton-dejavu] First execution including JIT compilation took 1.0206115245819092s. +[triton-dejavu] First execution including JIT compilation took 0.6365783214569092s. +[triton-dejavu] First execution including JIT compilation took 0.5057172775268555s. +[triton-dejavu] First execution including JIT compilation took 0.9307384490966797s. +[triton-dejavu] First execution including JIT compilation took 0.6267914772033691s. +[triton-dejavu] First execution including JIT compilation took 0.5471899509429932s. +[triton-dejavu] First execution including JIT compilation took 0.1969449520111084s. +[triton-dejavu] First execution including JIT compilation took 0.1712944507598877s. +[triton-dejavu] First execution including JIT compilation took 0.1570436954498291s. +[triton-dejavu] First execution including JIT compilation took 0.2668495178222656s. +[triton-dejavu] First execution including JIT compilation took 0.2014913558959961s. +[triton-dejavu] First execution including JIT compilation took 0.17658185958862305s. +[triton-dejavu] First execution including JIT compilation took 0.24475407600402832s. +[triton-dejavu] First execution including JIT compilation took 0.20715713500976562s. +[triton-dejavu] First execution including JIT compilation took 0.18568778038024902s. +[triton-dejavu] First execution including JIT compilation took 0.2623903751373291s. +[triton-dejavu] First execution including JIT compilation took 0.20713019371032715s. +[triton-dejavu] First execution including JIT compilation took 0.17886114120483398s. +[triton-dejavu] First execution including JIT compilation took 0.2510707378387451s. +[triton-dejavu] First execution including JIT compilation took 0.21624040603637695s. +[triton-dejavu] First execution including JIT compilation took 0.20012712478637695s. +[triton-dejavu] First execution including JIT compilation took 0.2688755989074707s. +[triton-dejavu] First execution including JIT compilation took 0.2036607265472412s. +[triton-dejavu] First execution including JIT compilation took 0.21555137634277344s. +[triton-dejavu] First execution including JIT compilation took 0.272658109664917s. +[triton-dejavu] First execution including JIT compilation took 0.2918975353240967s. +[triton-dejavu] First execution including JIT compilation took 0.22692298889160156s. +[triton-dejavu] First execution including JIT compilation took 0.20147228240966797s. +[triton-dejavu] First execution including JIT compilation took 0.17040586471557617s. +[triton-dejavu] First execution including JIT compilation took 0.1717395782470703s. +[triton-dejavu] First execution including JIT compilation took 0.2556333541870117s. +[triton-dejavu] First execution including JIT compilation took 0.19439339637756348s. 
+[triton-dejavu] First execution including JIT compilation took 0.18878650665283203s. +[triton-dejavu] First execution including JIT compilation took 0.28153157234191895s. +[triton-dejavu] First execution including JIT compilation took 0.22823047637939453s. +[triton-dejavu] First execution including JIT compilation took 0.20015215873718262s. +[triton-dejavu] First execution including JIT compilation took 0.2893240451812744s. +[triton-dejavu] First execution including JIT compilation took 0.2234363555908203s. +[triton-dejavu] First execution including JIT compilation took 0.20252442359924316s. +[triton-dejavu] First execution including JIT compilation took 0.29529833793640137s. +[triton-dejavu] First execution including JIT compilation took 0.25741052627563477s. +[triton-dejavu] First execution including JIT compilation took 0.22293853759765625s. +[triton-dejavu] First execution including JIT compilation took 0.32663512229919434s. +[triton-dejavu] First execution including JIT compilation took 0.257922887802124s. +[triton-dejavu] First execution including JIT compilation took 0.2501180171966553s. +[triton-dejavu] First execution including JIT compilation took 0.3506193161010742s. +[triton-dejavu] First execution including JIT compilation took 0.272749662399292s. +[triton-dejavu] First execution including JIT compilation took 0.243269681930542s. +[triton-dejavu] First execution including JIT compilation took 0.2345433235168457s. +[triton-dejavu] First execution including JIT compilation took 0.21488571166992188s. +[triton-dejavu] First execution including JIT compilation took 0.18851923942565918s. +[triton-dejavu] First execution including JIT compilation took 0.29990649223327637s. +[triton-dejavu] First execution including JIT compilation took 0.2599034309387207s. +[triton-dejavu] First execution including JIT compilation took 0.21689176559448242s. +[triton-dejavu] First execution including JIT compilation took 0.361560583114624s. +[triton-dejavu] First execution including JIT compilation took 0.27080440521240234s. +[triton-dejavu] First execution including JIT compilation took 0.22725224494934082s. +[triton-dejavu] First execution including JIT compilation took 0.34572768211364746s. +[triton-dejavu] First execution including JIT compilation took 0.2681708335876465s. +[triton-dejavu] First execution including JIT compilation took 0.22074484825134277s. +[triton-dejavu] First execution including JIT compilation took 0.3579220771789551s. +[triton-dejavu] First execution including JIT compilation took 0.2913625240325928s. +[triton-dejavu] First execution including JIT compilation took 0.27397990226745605s. +[triton-dejavu] First execution including JIT compilation took 0.36322855949401855s. +[triton-dejavu] First execution including JIT compilation took 0.36347508430480957s. +[triton-dejavu] First execution including JIT compilation took 0.2753303050994873s. +[triton-dejavu] First execution including JIT compilation took 0.4066603183746338s. +[triton-dejavu] First execution including JIT compilation took 0.4136660099029541s. +[triton-dejavu] First execution including JIT compilation took 0.29329895973205566s. +[triton-dejavu] First execution including JIT compilation took 0.37958860397338867s. +[triton-dejavu] First execution including JIT compilation took 0.24896860122680664s. +[triton-dejavu] First execution including JIT compilation took 0.21965575218200684s. +[triton-dejavu] First execution including JIT compilation took 0.4879426956176758s. 
+[triton-dejavu] First execution including JIT compilation took 0.33871960639953613s. +[triton-dejavu] First execution including JIT compilation took 0.24471020698547363s. +[triton-dejavu] First execution including JIT compilation took 0.4965670108795166s. +[triton-dejavu] First execution including JIT compilation took 0.40749454498291016s. +[triton-dejavu] First execution including JIT compilation took 0.2844102382659912s. +[triton-dejavu] First execution including JIT compilation took 0.6162877082824707s. +[triton-dejavu] First execution including JIT compilation took 0.37363123893737793s. +[triton-dejavu] First execution including JIT compilation took 0.30038881301879883s. +[triton-dejavu] First execution including JIT compilation took 0.6782312393188477s. +[triton-dejavu] First execution including JIT compilation took 0.395599365234375s. +[triton-dejavu] First execution including JIT compilation took 0.31715917587280273s. +[triton-dejavu] First execution including JIT compilation took 0.6199126243591309s. +[triton-dejavu] First execution including JIT compilation took 0.4322071075439453s. +[triton-dejavu] First execution including JIT compilation took 0.3455088138580322s. +[triton-dejavu] First execution including JIT compilation took 0.8280949592590332s. +[triton-dejavu] First execution including JIT compilation took 0.5218696594238281s. +[triton-dejavu] First execution including JIT compilation took 0.36759161949157715s. +[triton-dejavu] First execution including JIT compilation took 0.601407527923584s. +[triton-dejavu] First execution including JIT compilation took 0.36752843856811523s. +[triton-dejavu] First execution including JIT compilation took 0.273007869720459s. +[triton-dejavu] First execution including JIT compilation took 0.8815312385559082s. +[triton-dejavu] First execution including JIT compilation took 0.5408468246459961s. +[triton-dejavu] First execution including JIT compilation took 0.4321472644805908s. +[triton-dejavu] First execution including JIT compilation took 1.340597152709961s. +[triton-dejavu] First execution including JIT compilation took 0.6468391418457031s. +[triton-dejavu] First execution including JIT compilation took 0.4674386978149414s. +[triton-dejavu] First execution including JIT compilation took 1.4745817184448242s. +[triton-dejavu] First execution including JIT compilation took 0.7319414615631104s. +[triton-dejavu] First execution including JIT compilation took 0.4820535182952881s. +[triton-dejavu] First execution including JIT compilation took 1.6843111515045166s. +[triton-dejavu] First execution including JIT compilation took 0.7272007465362549s. +[triton-dejavu] First execution including JIT compilation took 0.5684032440185547s. +[triton-dejavu] First execution including JIT compilation took 1.6687507629394531s. +[triton-dejavu] First execution including JIT compilation took 0.7634897232055664s. +[triton-dejavu] First execution including JIT compilation took 0.5958552360534668s. +bench_cudagraph failed with out of resource: shared memory, Required: 308224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 308224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 308224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 308224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 308224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 308224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.21318602561950684s. +[triton-dejavu] First execution including JIT compilation took 0.21699094772338867s. +[triton-dejavu] First execution including JIT compilation took 0.18067646026611328s. +[triton-dejavu] First execution including JIT compilation took 0.2883434295654297s. +[triton-dejavu] First execution including JIT compilation took 0.20682692527770996s. +[triton-dejavu] First execution including JIT compilation took 0.19046735763549805s. +[triton-dejavu] First execution including JIT compilation took 0.2932305335998535s. +[triton-dejavu] First execution including JIT compilation took 0.21595001220703125s. +[triton-dejavu] First execution including JIT compilation took 0.18168210983276367s. +[triton-dejavu] First execution including JIT compilation took 0.32982873916625977s. +[triton-dejavu] First execution including JIT compilation took 0.22777533531188965s. +[triton-dejavu] First execution including JIT compilation took 0.2037661075592041s. +[triton-dejavu] First execution including JIT compilation took 0.31747984886169434s. +[triton-dejavu] First execution including JIT compilation took 0.28336167335510254s. +[triton-dejavu] First execution including JIT compilation took 0.19774699211120605s. +[triton-dejavu] First execution including JIT compilation took 0.3306758403778076s. +[triton-dejavu] First execution including JIT compilation took 0.2724292278289795s. +[triton-dejavu] First execution including JIT compilation took 0.22767090797424316s. +[triton-dejavu] First execution including JIT compilation took 0.3717081546783447s. +[triton-dejavu] First execution including JIT compilation took 0.2847135066986084s. +[triton-dejavu] First execution including JIT compilation took 0.2544288635253906s. +[triton-dejavu] First execution including JIT compilation took 0.2563972473144531s. +[triton-dejavu] First execution including JIT compilation took 0.21262860298156738s. +[triton-dejavu] First execution including JIT compilation took 0.2203054428100586s. +[triton-dejavu] First execution including JIT compilation took 0.3555338382720947s. 
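[Note, not part of the tuning log] The OutOfResources failures above are expected during an exhaustive sweep: the largest tile / num_stages combinations request more shared memory than one H100 thread block can get (the log reports a hardware limit of 232448 bytes, i.e. 227 KiB), so the benchmark is reported as failed and the sweep continues, which is why the timing lines resume after each traceback. As a rough, illustrative approximation (Triton's real accounting also includes additional buffers, hence numbers like 308224 or 268800 bytes in the log), the footprint of a software-pipelined matmul tile scales like num_stages * (BLOCK_M*BLOCK_K + BLOCK_K*BLOCK_N) * element_size:

def approx_smem_bytes(block_m, block_n, block_k, num_stages, elem_bytes=2):
    # one A tile (M x K) and one B tile (K x N) buffered per pipeline stage; bf16 = 2 bytes
    return num_stages * (block_m * block_k + block_k * block_n) * elem_bytes

print(approx_smem_bytes(256, 256, 64, 5))  # 327680 > 232448 -> would fail
print(approx_smem_bytes(128, 128, 64, 4))  # 131072 -> fits comfortably

Lowering the block sizes or num_stages, as the error message suggests, brings the estimate back under the limit.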
+[triton-dejavu] First execution including JIT compilation took 0.25258374214172363s. +[triton-dejavu] First execution including JIT compilation took 0.22145795822143555s. +[triton-dejavu] First execution including JIT compilation took 0.39704275131225586s. +[triton-dejavu] First execution including JIT compilation took 0.26523470878601074s. +[triton-dejavu] First execution including JIT compilation took 0.21595096588134766s. +[triton-dejavu] First execution including JIT compilation took 0.4347100257873535s. +[triton-dejavu] First execution including JIT compilation took 0.29169178009033203s. +[triton-dejavu] First execution including JIT compilation took 0.21956300735473633s. +[triton-dejavu] First execution including JIT compilation took 0.4330458641052246s. +[triton-dejavu] First execution including JIT compilation took 0.31913185119628906s. +[triton-dejavu] First execution including JIT compilation took 0.2509474754333496s. +[triton-dejavu] First execution including JIT compilation took 0.48702025413513184s. +[triton-dejavu] First execution including JIT compilation took 0.32025718688964844s. +[triton-dejavu] First execution including JIT compilation took 0.2625458240509033s. +[triton-dejavu] First execution including JIT compilation took 0.551466703414917s. +[triton-dejavu] First execution including JIT compilation took 0.37408924102783203s. +[triton-dejavu] First execution including JIT compilation took 0.27136731147766113s. +[triton-dejavu] First execution including JIT compilation took 0.33800768852233887s. +[triton-dejavu] First execution including JIT compilation took 0.26560330390930176s. +[triton-dejavu] First execution including JIT compilation took 0.20183563232421875s. +[triton-dejavu] First execution including JIT compilation took 0.40157294273376465s. +[triton-dejavu] First execution including JIT compilation took 0.3323667049407959s. +[triton-dejavu] First execution including JIT compilation took 0.2476518154144287s. +[triton-dejavu] First execution including JIT compilation took 0.5237414836883545s. +[triton-dejavu] First execution including JIT compilation took 0.38099026679992676s. +[triton-dejavu] First execution including JIT compilation took 0.25824856758117676s. +[triton-dejavu] First execution including JIT compilation took 0.5798733234405518s. +[triton-dejavu] First execution including JIT compilation took 0.4060328006744385s. +[triton-dejavu] First execution including JIT compilation took 0.299180269241333s. +[triton-dejavu] First execution including JIT compilation took 0.5705587863922119s. +[triton-dejavu] First execution including JIT compilation took 0.43184709548950195s. +[triton-dejavu] First execution including JIT compilation took 0.29991817474365234s. +[triton-dejavu] First execution including JIT compilation took 0.5768892765045166s. +[triton-dejavu] First execution including JIT compilation took 0.5104458332061768s. +[triton-dejavu] First execution including JIT compilation took 0.36955881118774414s. +[triton-dejavu] First execution including JIT compilation took 0.6489105224609375s. +[triton-dejavu] First execution including JIT compilation took 0.5593419075012207s. +[triton-dejavu] First execution including JIT compilation took 0.3752884864807129s. +[triton-dejavu] First execution including JIT compilation took 0.6494286060333252s. +[triton-dejavu] First execution including JIT compilation took 0.35906028747558594s. +[triton-dejavu] First execution including JIT compilation took 0.2642378807067871s. 
+[triton-dejavu] First execution including JIT compilation took 0.7536261081695557s. +[triton-dejavu] First execution including JIT compilation took 0.4381115436553955s. +[triton-dejavu] First execution including JIT compilation took 0.3165860176086426s. +[triton-dejavu] First execution including JIT compilation took 1.265178918838501s. +[triton-dejavu] First execution including JIT compilation took 0.5233526229858398s. +[triton-dejavu] First execution including JIT compilation took 0.3574998378753662s. +[triton-dejavu] First execution including JIT compilation took 1.300689697265625s. +[triton-dejavu] First execution including JIT compilation took 0.6212594509124756s. +[triton-dejavu] First execution including JIT compilation took 0.4114842414855957s. +[triton-dejavu] First execution including JIT compilation took 1.3530914783477783s. +[triton-dejavu] First execution including JIT compilation took 0.6589765548706055s. +[triton-dejavu] First execution including JIT compilation took 0.4349792003631592s. +[triton-dejavu] First execution including JIT compilation took 1.412661075592041s. +[triton-dejavu] First execution including JIT compilation took 0.7154123783111572s. +[triton-dejavu] First execution including JIT compilation took 0.5011796951293945s. +bench_cudagraph failed with out of resource: shared memory, Required: 268800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 268800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 268800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+
+bench_cudagraph failed with out of resource: shared memory, Required: 301568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 1.3227477073669434s.
+[triton-dejavu] First execution including JIT compilation took 0.6530427932739258s.
+[triton-dejavu] First execution including JIT compilation took 0.35028672218322754s.
+[triton-dejavu] First execution including JIT compilation took 2.6119463443756104s.
+[triton-dejavu] First execution including JIT compilation took 0.884284257888794s.
+[triton-dejavu] First execution including JIT compilation took 0.5999755859375s.
+[triton-dejavu] First execution including JIT compilation took 6.0120015144348145s.
+[triton-dejavu] First execution including JIT compilation took 1.4350576400756836s.
+[triton-dejavu] First execution including JIT compilation took 0.6809098720550537s.
+[triton-dejavu] First execution including JIT compilation took 6.039306402206421s.
+[triton-dejavu] First execution including JIT compilation took 1.520536184310913s.
+[triton-dejavu] First execution including JIT compilation took 0.7370305061340332s.
+bench_cudagraph failed with out of resource: shared memory, Required: 307200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 307200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 307200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 384000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 384000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 384000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 537600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 537600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 537600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 0.29415297508239746s.
+[triton-dejavu] First execution including JIT compilation took 0.2148146629333496s.
+[triton-dejavu] First execution including JIT compilation took 0.20547103881835938s.
+[triton-dejavu] First execution including JIT compilation took 0.4116225242614746s.
+[triton-dejavu] First execution including JIT compilation took 0.27173876762390137s.
+[triton-dejavu] First execution including JIT compilation took 0.2047574520111084s.
+[triton-dejavu] First execution including JIT compilation took 0.4177091121673584s.
+[triton-dejavu] First execution including JIT compilation took 0.2812669277191162s.
+[triton-dejavu] First execution including JIT compilation took 0.25458741188049316s.
+[triton-dejavu] First execution including JIT compilation took 0.4544975757598877s.
+[triton-dejavu] First execution including JIT compilation took 0.30648016929626465s.
+[triton-dejavu] First execution including JIT compilation took 0.2235255241394043s.
+[triton-dejavu] First execution including JIT compilation took 0.47878551483154297s.
+[triton-dejavu] First execution including JIT compilation took 0.3247034549713135s.
+[triton-dejavu] First execution including JIT compilation took 0.2551548480987549s.
+[triton-dejavu] First execution including JIT compilation took 0.6796090602874756s.
+[triton-dejavu] First execution including JIT compilation took 0.3536677360534668s.
+[triton-dejavu] First execution including JIT compilation took 0.2720470428466797s.
+[triton-dejavu] First execution including JIT compilation took 0.8124823570251465s.
+[triton-dejavu] First execution including JIT compilation took 0.4978444576263428s.
+[triton-dejavu] First execution including JIT compilation took 0.35080695152282715s.
+[triton-dejavu] First execution including JIT compilation took 0.5138082504272461s.
+[triton-dejavu] First execution including JIT compilation took 0.3385753631591797s.
+[triton-dejavu] First execution including JIT compilation took 0.2594444751739502s.
+[triton-dejavu] First execution including JIT compilation took 0.6842827796936035s.
+[triton-dejavu] First execution including JIT compilation took 0.4294295310974121s.
+[triton-dejavu] First execution including JIT compilation took 0.3218364715576172s.
+[triton-dejavu] First execution including JIT compilation took 0.8480286598205566s.
+[triton-dejavu] First execution including JIT compilation took 0.472670316696167s.
+[triton-dejavu] First execution including JIT compilation took 0.36148762702941895s.
+[triton-dejavu] First execution including JIT compilation took 0.9478855133056641s.
+[triton-dejavu] First execution including JIT compilation took 0.5432147979736328s.
+[triton-dejavu] First execution including JIT compilation took 0.38411760330200195s.
+[triton-dejavu] First execution including JIT compilation took 1.0501837730407715s.
+[triton-dejavu] First execution including JIT compilation took 0.5907988548278809s.
+[triton-dejavu] First execution including JIT compilation took 0.39850473403930664s.
+[triton-dejavu] First execution including JIT compilation took 1.1722888946533203s.
+[triton-dejavu] First execution including JIT compilation took 0.6436972618103027s.
+[triton-dejavu] First execution including JIT compilation took 0.42680954933166504s.
+[triton-dejavu] First execution including JIT compilation took 1.3340017795562744s.
+[triton-dejavu] First execution including JIT compilation took 0.5718722343444824s.
+[triton-dejavu] First execution including JIT compilation took 0.38933897018432617s.
+[triton-dejavu] First execution including JIT compilation took 0.6949644088745117s.
+[triton-dejavu] First execution including JIT compilation took 0.3732309341430664s.
+[triton-dejavu] First execution including JIT compilation took 0.26645493507385254s.
+[triton-dejavu] First execution including JIT compilation took 0.6677834987640381s.
+[triton-dejavu] First execution including JIT compilation took 0.5330057144165039s.
+[triton-dejavu] First execution including JIT compilation took 0.3234426975250244s.
+[triton-dejavu] First execution including JIT compilation took 1.226240634918213s.
+[triton-dejavu] First execution including JIT compilation took 0.7037711143493652s.
+[triton-dejavu] First execution including JIT compilation took 0.35811614990234375s.
+[triton-dejavu] First execution including JIT compilation took 1.223371982574463s.
+[triton-dejavu] First execution including JIT compilation took 0.7030131816864014s.
+[triton-dejavu] First execution including JIT compilation took 0.44534802436828613s.
+[triton-dejavu] First execution including JIT compilation took 1.3601360321044922s.
+[triton-dejavu] First execution including JIT compilation took 0.8273930549621582s.
+[triton-dejavu] First execution including JIT compilation took 0.4698348045349121s.
+[triton-dejavu] First execution including JIT compilation took 1.3899588584899902s.
+[triton-dejavu] First execution including JIT compilation took 0.9071271419525146s.
+[triton-dejavu] First execution including JIT compilation took 0.47567152976989746s.
+bench_cudagraph failed with out of resource: shared memory, Required: 249088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 281856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 281856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 1.3848598003387451s.
+[triton-dejavu] First execution including JIT compilation took 0.5705807209014893s.
+[triton-dejavu] First execution including JIT compilation took 0.3730134963989258s.
+[triton-dejavu] First execution including JIT compilation took 1.7507147789001465s.
+[triton-dejavu] First execution including JIT compilation took 0.739149808883667s.
+[triton-dejavu] First execution including JIT compilation took 0.5719029903411865s.
+[triton-dejavu] First execution including JIT compilation took 5.391368865966797s.
+[triton-dejavu] First execution including JIT compilation took 1.2137444019317627s.
+[triton-dejavu] First execution including JIT compilation took 0.6304950714111328s.
+[triton-dejavu] First execution including JIT compilation took 5.735509157180786s.
+[triton-dejavu] First execution including JIT compilation took 1.278113603591919s.
+bench_cudagraph failed with out of resource: shared memory, Required: 279040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 350208, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 355840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 355840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 421376, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 498176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 498176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 563712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 4.086903095245361s.
+[triton-dejavu] First execution including JIT compilation took 1.3743896484375s.
+[triton-dejavu] First execution including JIT compilation took 0.5919761657714844s.
+[triton-dejavu] First execution including JIT compilation took 5.130200147628784s.
+[triton-dejavu] First execution including JIT compilation took 1.6156775951385498s.
+[triton-dejavu] First execution including JIT compilation took 0.8351900577545166s.
+bench_cudagraph failed with out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 427008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 427008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 427008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 569344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 569344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 569344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 711680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 711680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 711680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 996352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 996352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 996352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 996352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 996352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+[triton-dejavu] First execution including JIT compilation took 0.17352676391601562s.
+[triton-dejavu] First execution including JIT compilation took 0.16042280197143555s.
+[triton-dejavu] First execution including JIT compilation took 0.15305304527282715s.
+[triton-dejavu] First execution including JIT compilation took 0.40444135665893555s.
+bench_cudagraph failed with out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 0.2316875457763672s.
+[triton-dejavu] First execution including JIT compilation took 0.8153319358825684s.
+bench_cudagraph failed with out of resource: shared memory, Required: 261120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 365568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 0.25391125679016113s.
+[triton-dejavu] First execution including JIT compilation took 0.8834891319274902s. +[triton-dejavu] First execution including JIT compilation took 0.49424219131469727s. +[triton-dejavu] First execution including JIT compilation took 0.39174604415893555s. +[triton-dejavu] First execution including JIT compilation took 1.3143653869628906s. +[triton-dejavu] First execution including JIT compilation took 0.646043062210083s. +[triton-dejavu] First execution including JIT compilation took 0.626563549041748s. +[triton-dejavu] First execution including JIT compilation took 1.4293315410614014s. +[triton-dejavu] First execution including JIT compilation took 0.6236376762390137s. +[triton-dejavu] First execution including JIT compilation took 0.48520755767822266s. +[triton-dejavu] First execution including JIT compilation took 1.6379265785217285s. +[triton-dejavu] First execution including JIT compilation took 0.7366843223571777s. +[triton-dejavu] First execution including JIT compilation took 0.4963555335998535s. +[triton-dejavu] First execution including JIT compilation took 1.574018955230713s. +[triton-dejavu] First execution including JIT compilation took 0.7276029586791992s. +bench_cudagraph failed with out of resource: shared memory, Required: 245248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+bench_cudagraph failed with out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 330240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 1.557365894317627s.
+[triton-dejavu] First execution including JIT compilation took 0.6995244026184082s.
+[triton-dejavu] First execution including JIT compilation took 0.42113590240478516s.
+[triton-dejavu] First execution including JIT compilation took 2.4745399951934814s.
+[triton-dejavu] First execution including JIT compilation took 0.9797840118408203s.
+[triton-dejavu] First execution including JIT compilation took 0.6241872310638428s.
+[triton-dejavu] First execution including JIT compilation took 6.13117790222168s.
+[triton-dejavu] First execution including JIT compilation took 1.4725189208984375s.
+[triton-dejavu] First execution including JIT compilation took 0.681943416595459s.
+bench_cudagraph failed with out of resource: shared memory, Required: 254976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 254976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 254976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 424960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 424960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 424960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 0.3794093132019043s.
+[triton-dejavu] First execution including JIT compilation took 0.28264904022216797s.
+[triton-dejavu] First execution including JIT compilation took 0.22452759742736816s.
+[triton-dejavu] First execution including JIT compilation took 0.559002161026001s.
+[triton-dejavu] First execution including JIT compilation took 0.3224790096282959s.
+[triton-dejavu] First execution including JIT compilation took 0.28389406204223633s.
+[triton-dejavu] First execution including JIT compilation took 0.5182971954345703s. +[triton-dejavu] First execution including JIT compilation took 0.345487117767334s. +[triton-dejavu] First execution including JIT compilation took 0.2435622215270996s. +[triton-dejavu] First execution including JIT compilation took 0.5336036682128906s. +[triton-dejavu] First execution including JIT compilation took 0.3894071578979492s. +[triton-dejavu] First execution including JIT compilation took 0.3008308410644531s. +[triton-dejavu] First execution including JIT compilation took 0.7498984336853027s. +[triton-dejavu] First execution including JIT compilation took 0.41705965995788574s. +[triton-dejavu] First execution including JIT compilation took 0.2856142520904541s. +[triton-dejavu] First execution including JIT compilation took 0.7986507415771484s. +[triton-dejavu] First execution including JIT compilation took 0.506192684173584s. +[triton-dejavu] First execution including JIT compilation took 0.35767054557800293s. +[triton-dejavu] First execution including JIT compilation took 0.9271838665008545s. +[triton-dejavu] First execution including JIT compilation took 0.5614745616912842s. +[triton-dejavu] First execution including JIT compilation took 0.39832496643066406s. +[triton-dejavu] First execution including JIT compilation took 0.6550092697143555s. +[triton-dejavu] First execution including JIT compilation took 0.4102933406829834s. +[triton-dejavu] First execution including JIT compilation took 0.28809452056884766s. +[triton-dejavu] First execution including JIT compilation took 0.8442857265472412s. +[triton-dejavu] First execution including JIT compilation took 0.49399375915527344s. +[triton-dejavu] First execution including JIT compilation took 0.3414480686187744s. +[triton-dejavu] First execution including JIT compilation took 0.9948995113372803s. +[triton-dejavu] First execution including JIT compilation took 0.544846773147583s. +[triton-dejavu] First execution including JIT compilation took 0.36998677253723145s. +[triton-dejavu] First execution including JIT compilation took 1.1347663402557373s. +[triton-dejavu] First execution including JIT compilation took 0.5956213474273682s. +[triton-dejavu] First execution including JIT compilation took 0.41924381256103516s. +[triton-dejavu] First execution including JIT compilation took 1.2498164176940918s. +[triton-dejavu] First execution including JIT compilation took 0.6886944770812988s. +[triton-dejavu] First execution including JIT compilation took 0.45352959632873535s. +[triton-dejavu] First execution including JIT compilation took 1.3488807678222656s. +[triton-dejavu] First execution including JIT compilation took 0.7345826625823975s. +[triton-dejavu] First execution including JIT compilation took 0.4611852169036865s. +[triton-dejavu] First execution including JIT compilation took 1.6846129894256592s. +[triton-dejavu] First execution including JIT compilation took 0.8527877330780029s. +[triton-dejavu] First execution including JIT compilation took 0.519357442855835s. +[triton-dejavu] First execution including JIT compilation took 0.9926292896270752s. +[triton-dejavu] First execution including JIT compilation took 0.5671131610870361s. +[triton-dejavu] First execution including JIT compilation took 0.36908459663391113s. +[triton-dejavu] First execution including JIT compilation took 1.1392111778259277s. +[triton-dejavu] First execution including JIT compilation took 0.7338624000549316s. 
+[triton-dejavu] First execution including JIT compilation took 0.37808799743652344s.
+[triton-dejavu] First execution including JIT compilation took 1.32969069480896s.
+[triton-dejavu] First execution including JIT compilation took 0.7195644378662109s.
+[triton-dejavu] First execution including JIT compilation took 0.43347692489624023s.
+[triton-dejavu] First execution including JIT compilation took 1.5576729774475098s.
+[triton-dejavu] First execution including JIT compilation took 0.780888557434082s.
+[triton-dejavu] First execution including JIT compilation took 0.5686335563659668s.
+[triton-dejavu] First execution including JIT compilation took 1.5757191181182861s.
+[triton-dejavu] First execution including JIT compilation took 1.1339452266693115s.
+[triton-dejavu] First execution including JIT compilation took 0.6171472072601318s.
+[triton-dejavu] First execution including JIT compilation took 1.9367270469665527s.
+[triton-dejavu] First execution including JIT compilation took 1.2703828811645508s.
+[triton-dejavu] First execution including JIT compilation took 0.6219308376312256s.
+bench_cudagraph failed with out of resource: shared memory, Required: 263424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 296192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 296192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 2.229659080505371s.
+[triton-dejavu] First execution including JIT compilation took 0.6827125549316406s.
+[triton-dejavu] First execution including JIT compilation took 0.4279639720916748s.
+[triton-dejavu] First execution including JIT compilation took 2.214158535003662s.
+[triton-dejavu] First execution including JIT compilation took 0.847602367401123s.
+[triton-dejavu] First execution including JIT compilation took 0.5684854984283447s.
+[triton-dejavu] First execution including JIT compilation took 5.671643257141113s.
+[triton-dejavu] First execution including JIT compilation took 1.3386998176574707s.
+[triton-dejavu] First execution including JIT compilation took 0.7006118297576904s.
+[triton-dejavu] First execution including JIT compilation took 6.009850025177002s.
+[triton-dejavu] First execution including JIT compilation took 1.425264596939087s.
+bench_cudagraph failed with out of resource: shared memory, Required: 291328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 376320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 376320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 441856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 526848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 526848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 592384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 4.9762208461761475s.
+[triton-dejavu] First execution including JIT compilation took 1.5692176818847656s.
+[triton-dejavu] First execution including JIT compilation took 0.7641324996948242s.
+[triton-dejavu] First execution including JIT compilation took 6.608908176422119s.
+[triton-dejavu] First execution including JIT compilation took 2.132209062576294s.
+[triton-dejavu] First execution including JIT compilation took 0.9537761211395264s.
+bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 451584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 451584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 451584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 602112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 602112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 602112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 602112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 752640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 752640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 752640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 752640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 752640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 752640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.1910562515258789s. +[triton-dejavu] First execution including JIT compilation took 0.17246103286743164s. +[triton-dejavu] First execution including JIT compilation took 0.17216134071350098s. +[triton-dejavu] First execution including JIT compilation took 0.2674295902252197s. +[triton-dejavu] First execution including JIT compilation took 0.2252979278564453s. +[triton-dejavu] First execution including JIT compilation took 0.19235920906066895s. +[triton-dejavu] First execution including JIT compilation took 0.21264386177062988s. +[triton-dejavu] First execution including JIT compilation took 0.2161550521850586s. +[triton-dejavu] First execution including JIT compilation took 0.1922304630279541s. +[triton-dejavu] First execution including JIT compilation took 0.25547218322753906s. +[triton-dejavu] First execution including JIT compilation took 0.27811694145202637s. +[triton-dejavu] First execution including JIT compilation took 0.23806548118591309s. +[triton-dejavu] First execution including JIT compilation took 0.3011937141418457s. +[triton-dejavu] First execution including JIT compilation took 0.2934706211090088s. +[triton-dejavu] First execution including JIT compilation took 0.25322985649108887s. +[triton-dejavu] First execution including JIT compilation took 0.3128054141998291s. +[triton-dejavu] First execution including JIT compilation took 0.28246212005615234s. +[triton-dejavu] First execution including JIT compilation took 0.2546539306640625s. +[triton-dejavu] First execution including JIT compilation took 0.33473944664001465s. +[triton-dejavu] First execution including JIT compilation took 0.30516982078552246s. +[triton-dejavu] First execution including JIT compilation took 0.27489733695983887s. +[triton-dejavu] First execution including JIT compilation took 0.24262332916259766s. +[triton-dejavu] First execution including JIT compilation took 0.2100353240966797s. +[triton-dejavu] First execution including JIT compilation took 0.19793057441711426s. 
+[triton-dejavu] First execution including JIT compilation took 0.2780025005340576s. +[triton-dejavu] First execution including JIT compilation took 0.24424457550048828s. +[triton-dejavu] First execution including JIT compilation took 0.231339693069458s. +[triton-dejavu] First execution including JIT compilation took 0.29887890815734863s. +[triton-dejavu] First execution including JIT compilation took 0.2637321949005127s. +[triton-dejavu] First execution including JIT compilation took 0.24405384063720703s. +[triton-dejavu] First execution including JIT compilation took 0.32925963401794434s. +[triton-dejavu] First execution including JIT compilation took 0.28090524673461914s. +[triton-dejavu] First execution including JIT compilation took 0.2658822536468506s. +[triton-dejavu] First execution including JIT compilation took 0.34981393814086914s. +[triton-dejavu] First execution including JIT compilation took 0.2969369888305664s. +[triton-dejavu] First execution including JIT compilation took 0.273942232131958s. +[triton-dejavu] First execution including JIT compilation took 0.37868213653564453s. +[triton-dejavu] First execution including JIT compilation took 0.33127617835998535s. +[triton-dejavu] First execution including JIT compilation took 0.3416633605957031s. +[triton-dejavu] First execution including JIT compilation took 0.41475677490234375s. +[triton-dejavu] First execution including JIT compilation took 0.33086156845092773s. +[triton-dejavu] First execution including JIT compilation took 0.3177492618560791s. +[triton-dejavu] First execution including JIT compilation took 0.3063650131225586s. +[triton-dejavu] First execution including JIT compilation took 0.23031854629516602s. +[triton-dejavu] First execution including JIT compilation took 0.21300745010375977s. +[triton-dejavu] First execution including JIT compilation took 0.38768625259399414s. +[triton-dejavu] First execution including JIT compilation took 0.2662017345428467s. +[triton-dejavu] First execution including JIT compilation took 0.24376845359802246s. +[triton-dejavu] First execution including JIT compilation took 0.42224621772766113s. +[triton-dejavu] First execution including JIT compilation took 0.28191328048706055s. +[triton-dejavu] First execution including JIT compilation took 0.273775577545166s. +[triton-dejavu] First execution including JIT compilation took 0.4455993175506592s. +[triton-dejavu] First execution including JIT compilation took 0.3689110279083252s. +[triton-dejavu] First execution including JIT compilation took 0.26688480377197266s. +[triton-dejavu] First execution including JIT compilation took 0.4688987731933594s. +[triton-dejavu] First execution including JIT compilation took 0.31668877601623535s. +[triton-dejavu] First execution including JIT compilation took 0.2852771282196045s. +[triton-dejavu] First execution including JIT compilation took 0.5058488845825195s. +[triton-dejavu] First execution including JIT compilation took 0.33969998359680176s. +[triton-dejavu] First execution including JIT compilation took 0.3043205738067627s. +[triton-dejavu] First execution including JIT compilation took 0.5594408512115479s. +[triton-dejavu] First execution including JIT compilation took 0.38538432121276855s. +[triton-dejavu] First execution including JIT compilation took 0.40354394912719727s. +[triton-dejavu] First execution including JIT compilation took 0.4203341007232666s. +[triton-dejavu] First execution including JIT compilation took 0.2790985107421875s. 
+[triton-dejavu] First execution including JIT compilation took 0.197509765625s. +[triton-dejavu] First execution including JIT compilation took 0.5050961971282959s. +[triton-dejavu] First execution including JIT compilation took 0.2615811824798584s. +[triton-dejavu] First execution including JIT compilation took 0.23754334449768066s. +[triton-dejavu] First execution including JIT compilation took 0.5479357242584229s. +[triton-dejavu] First execution including JIT compilation took 0.29597973823547363s. +[triton-dejavu] First execution including JIT compilation took 0.22592473030090332s. +[triton-dejavu] First execution including JIT compilation took 0.5904271602630615s. +[triton-dejavu] First execution including JIT compilation took 0.3177652359008789s. +[triton-dejavu] First execution including JIT compilation took 0.23325729370117188s. +[triton-dejavu] First execution including JIT compilation took 0.6337690353393555s. +[triton-dejavu] First execution including JIT compilation took 0.3158242702484131s. +[triton-dejavu] First execution including JIT compilation took 0.26456284523010254s. +[triton-dejavu] First execution including JIT compilation took 0.6728482246398926s. +[triton-dejavu] First execution including JIT compilation took 0.3370821475982666s. +[triton-dejavu] First execution including JIT compilation took 0.27890753746032715s. +[triton-dejavu] First execution including JIT compilation took 0.7555828094482422s. +[triton-dejavu] First execution including JIT compilation took 0.47994327545166016s. +[triton-dejavu] First execution including JIT compilation took 0.3138282299041748s. +[triton-dejavu] First execution including JIT compilation took 0.6586263179779053s. +[triton-dejavu] First execution including JIT compilation took 0.2855665683746338s. +[triton-dejavu] First execution including JIT compilation took 0.21575546264648438s. +[triton-dejavu] First execution including JIT compilation took 0.8698668479919434s. +[triton-dejavu] First execution including JIT compilation took 0.326815128326416s. +[triton-dejavu] First execution including JIT compilation took 0.24704337120056152s. +[triton-dejavu] First execution including JIT compilation took 1.3291542530059814s. +[triton-dejavu] First execution including JIT compilation took 0.41158032417297363s. +[triton-dejavu] First execution including JIT compilation took 0.2945075035095215s. +[triton-dejavu] First execution including JIT compilation took 1.4427604675292969s. +[triton-dejavu] First execution including JIT compilation took 0.4566466808319092s. +[triton-dejavu] First execution including JIT compilation took 0.35230016708374023s. +[triton-dejavu] First execution including JIT compilation took 1.5283832550048828s. +[triton-dejavu] First execution including JIT compilation took 0.822779655456543s. +[triton-dejavu] First execution including JIT compilation took 0.400043249130249s. +[triton-dejavu] First execution including JIT compilation took 1.59427809715271s. +bench_cudagraph failed with out of resource: shared memory, Required: 261120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+bench_cudagraph failed with out of resource: shared memory, Required: 261120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 308224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 349184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 349184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 349184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.19841480255126953s. +[triton-dejavu] First execution including JIT compilation took 0.19194793701171875s. +[triton-dejavu] First execution including JIT compilation took 0.1582164764404297s. +[triton-dejavu] First execution including JIT compilation took 0.24194097518920898s. +[triton-dejavu] First execution including JIT compilation took 0.21207785606384277s. +[triton-dejavu] First execution including JIT compilation took 0.19394159317016602s. +[triton-dejavu] First execution including JIT compilation took 0.24550700187683105s. +[triton-dejavu] First execution including JIT compilation took 0.21821212768554688s. +[triton-dejavu] First execution including JIT compilation took 0.18725252151489258s. +[triton-dejavu] First execution including JIT compilation took 0.26776623725891113s. +[triton-dejavu] First execution including JIT compilation took 0.20471405982971191s. +[triton-dejavu] First execution including JIT compilation took 0.20141196250915527s. +[triton-dejavu] First execution including JIT compilation took 0.26976442337036133s. +[triton-dejavu] First execution including JIT compilation took 0.24028730392456055s. +[triton-dejavu] First execution including JIT compilation took 0.23756718635559082s. +[triton-dejavu] First execution including JIT compilation took 0.2599141597747803s. +[triton-dejavu] First execution including JIT compilation took 0.23916363716125488s. +[triton-dejavu] First execution including JIT compilation took 0.21816468238830566s. +[triton-dejavu] First execution including JIT compilation took 0.28762292861938477s. +[triton-dejavu] First execution including JIT compilation took 0.2479848861694336s. +[triton-dejavu] First execution including JIT compilation took 0.25420284271240234s. +[triton-dejavu] First execution including JIT compilation took 0.2362511157989502s. +[triton-dejavu] First execution including JIT compilation took 0.18312764167785645s. +[triton-dejavu] First execution including JIT compilation took 0.17608380317687988s. 
+[triton-dejavu] First execution including JIT compilation took 0.2786374092102051s. +[triton-dejavu] First execution including JIT compilation took 0.21152758598327637s. +[triton-dejavu] First execution including JIT compilation took 0.20641374588012695s. +[triton-dejavu] First execution including JIT compilation took 0.30803728103637695s. +[triton-dejavu] First execution including JIT compilation took 0.23598504066467285s. +[triton-dejavu] First execution including JIT compilation took 0.2227318286895752s. +[triton-dejavu] First execution including JIT compilation took 0.3432927131652832s. +[triton-dejavu] First execution including JIT compilation took 0.22769927978515625s. +[triton-dejavu] First execution including JIT compilation took 0.20647501945495605s. +[triton-dejavu] First execution including JIT compilation took 0.3485453128814697s. +[triton-dejavu] First execution including JIT compilation took 0.2762014865875244s. +[triton-dejavu] First execution including JIT compilation took 0.21726274490356445s. +[triton-dejavu] First execution including JIT compilation took 0.32701706886291504s. +[triton-dejavu] First execution including JIT compilation took 0.24490046501159668s. +[triton-dejavu] First execution including JIT compilation took 0.2208249568939209s. +[triton-dejavu] First execution including JIT compilation took 0.36136794090270996s. +[triton-dejavu] First execution including JIT compilation took 0.3137195110321045s. +[triton-dejavu] First execution including JIT compilation took 0.26834893226623535s. +[triton-dejavu] First execution including JIT compilation took 0.32502317428588867s. +[triton-dejavu] First execution including JIT compilation took 0.21649813652038574s. +[triton-dejavu] First execution including JIT compilation took 0.18822789192199707s. +[triton-dejavu] First execution including JIT compilation took 0.34781932830810547s. +[triton-dejavu] First execution including JIT compilation took 0.25492358207702637s. +[triton-dejavu] First execution including JIT compilation took 0.21149992942810059s. +[triton-dejavu] First execution including JIT compilation took 0.41837024688720703s. +[triton-dejavu] First execution including JIT compilation took 0.2709987163543701s. +[triton-dejavu] First execution including JIT compilation took 0.22152233123779297s. +[triton-dejavu] First execution including JIT compilation took 0.46758460998535156s. +[triton-dejavu] First execution including JIT compilation took 0.2976984977722168s. +[triton-dejavu] First execution including JIT compilation took 0.2336409091949463s. +[triton-dejavu] First execution including JIT compilation took 0.42842841148376465s. +[triton-dejavu] First execution including JIT compilation took 0.30059170722961426s. +[triton-dejavu] First execution including JIT compilation took 0.25075721740722656s. +[triton-dejavu] First execution including JIT compilation took 0.4862644672393799s. +[triton-dejavu] First execution including JIT compilation took 0.32674360275268555s. +[triton-dejavu] First execution including JIT compilation took 0.3176698684692383s. +[triton-dejavu] First execution including JIT compilation took 0.6764540672302246s. +[triton-dejavu] First execution including JIT compilation took 0.4595639705657959s. +[triton-dejavu] First execution including JIT compilation took 0.3412759304046631s. +[triton-dejavu] First execution including JIT compilation took 0.5369167327880859s. +[triton-dejavu] First execution including JIT compilation took 0.3099100589752197s. 
+[triton-dejavu] First execution including JIT compilation took 0.2513244152069092s. +[triton-dejavu] First execution including JIT compilation took 0.683905839920044s. +[triton-dejavu] First execution including JIT compilation took 0.3577401638031006s. +[triton-dejavu] First execution including JIT compilation took 0.29708075523376465s. +[triton-dejavu] First execution including JIT compilation took 0.8124041557312012s. +[triton-dejavu] First execution including JIT compilation took 0.3909914493560791s. +[triton-dejavu] First execution including JIT compilation took 0.32225966453552246s. +[triton-dejavu] First execution including JIT compilation took 0.875910758972168s. +[triton-dejavu] First execution including JIT compilation took 0.4234771728515625s. +[triton-dejavu] First execution including JIT compilation took 0.3409273624420166s. +[triton-dejavu] First execution including JIT compilation took 0.9268946647644043s. +[triton-dejavu] First execution including JIT compilation took 0.46335840225219727s. +[triton-dejavu] First execution including JIT compilation took 0.3612051010131836s. +[triton-dejavu] First execution including JIT compilation took 0.9951462745666504s. +[triton-dejavu] First execution including JIT compilation took 0.4895823001861572s. +[triton-dejavu] First execution including JIT compilation took 0.38950228691101074s. +[triton-dejavu] First execution including JIT compilation took 1.1150171756744385s. +[triton-dejavu] First execution including JIT compilation took 0.5509529113769531s. +[triton-dejavu] First execution including JIT compilation took 0.4379761219024658s. +[triton-dejavu] First execution including JIT compilation took 0.9682984352111816s. +[triton-dejavu] First execution including JIT compilation took 0.37152743339538574s. +[triton-dejavu] First execution including JIT compilation took 0.25163698196411133s. +[triton-dejavu] First execution including JIT compilation took 1.097111701965332s. +[triton-dejavu] First execution including JIT compilation took 0.4002962112426758s. +[triton-dejavu] First execution including JIT compilation took 0.29827260971069336s. +[triton-dejavu] First execution including JIT compilation took 1.8650331497192383s. +[triton-dejavu] First execution including JIT compilation took 0.5061264038085938s. +[triton-dejavu] First execution including JIT compilation took 0.3558540344238281s. +[triton-dejavu] First execution including JIT compilation took 1.9241220951080322s. +[triton-dejavu] First execution including JIT compilation took 0.6082024574279785s. +[triton-dejavu] First execution including JIT compilation took 0.35095739364624023s. +[triton-dejavu] First execution including JIT compilation took 2.000699758529663s. +bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 261120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 310272, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 310272, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 365568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.23331809043884277s. +[triton-dejavu] First execution including JIT compilation took 0.2007770538330078s. +[triton-dejavu] First execution including JIT compilation took 0.16995501518249512s. +[triton-dejavu] First execution including JIT compilation took 0.2849550247192383s. +[triton-dejavu] First execution including JIT compilation took 0.24909639358520508s. +[triton-dejavu] First execution including JIT compilation took 0.20572757720947266s. 
+[triton-dejavu] First execution including JIT compilation took 0.2925271987915039s. +[triton-dejavu] First execution including JIT compilation took 0.24988293647766113s. +[triton-dejavu] First execution including JIT compilation took 0.19756340980529785s. +[triton-dejavu] First execution including JIT compilation took 0.308469295501709s. +[triton-dejavu] First execution including JIT compilation took 0.25925660133361816s. +[triton-dejavu] First execution including JIT compilation took 0.2018594741821289s. +[triton-dejavu] First execution including JIT compilation took 0.31905412673950195s. +[triton-dejavu] First execution including JIT compilation took 0.2588214874267578s. +[triton-dejavu] First execution including JIT compilation took 0.22939348220825195s. +[triton-dejavu] First execution including JIT compilation took 0.3315243721008301s. +[triton-dejavu] First execution including JIT compilation took 0.2577550411224365s. +[triton-dejavu] First execution including JIT compilation took 0.23265576362609863s. +[triton-dejavu] First execution including JIT compilation took 0.3648536205291748s. +[triton-dejavu] First execution including JIT compilation took 0.29656481742858887s. +[triton-dejavu] First execution including JIT compilation took 0.24346494674682617s. +[triton-dejavu] First execution including JIT compilation took 0.29159116744995117s. +[triton-dejavu] First execution including JIT compilation took 0.20661616325378418s. +[triton-dejavu] First execution including JIT compilation took 0.19298219680786133s. +[triton-dejavu] First execution including JIT compilation took 0.3263542652130127s. +[triton-dejavu] First execution including JIT compilation took 0.2544429302215576s. +[triton-dejavu] First execution including JIT compilation took 0.24164104461669922s. +[triton-dejavu] First execution including JIT compilation took 0.35983991622924805s. +[triton-dejavu] First execution including JIT compilation took 0.27173733711242676s. +[triton-dejavu] First execution including JIT compilation took 0.30269622802734375s. +[triton-dejavu] First execution including JIT compilation took 0.3681807518005371s. +[triton-dejavu] First execution including JIT compilation took 0.30908918380737305s. +[triton-dejavu] First execution including JIT compilation took 0.21474623680114746s. +[triton-dejavu] First execution including JIT compilation took 0.4122345447540283s. +[triton-dejavu] First execution including JIT compilation took 0.29869675636291504s. +[triton-dejavu] First execution including JIT compilation took 0.22951626777648926s. +[triton-dejavu] First execution including JIT compilation took 0.4384334087371826s. +[triton-dejavu] First execution including JIT compilation took 0.34481048583984375s. +[triton-dejavu] First execution including JIT compilation took 0.23748016357421875s. +[triton-dejavu] First execution including JIT compilation took 0.4472684860229492s. +[triton-dejavu] First execution including JIT compilation took 0.3110086917877197s. +[triton-dejavu] First execution including JIT compilation took 0.2900521755218506s. +[triton-dejavu] First execution including JIT compilation took 0.3711414337158203s. +[triton-dejavu] First execution including JIT compilation took 0.23607397079467773s. +[triton-dejavu] First execution including JIT compilation took 0.264019250869751s. +[triton-dejavu] First execution including JIT compilation took 0.7435603141784668s. +[triton-dejavu] First execution including JIT compilation took 0.44277524948120117s. 
+[triton-dejavu] First execution including JIT compilation took 0.21710801124572754s. +[triton-dejavu] First execution including JIT compilation took 0.4168999195098877s. +[triton-dejavu] First execution including JIT compilation took 0.3037564754486084s. +[triton-dejavu] First execution including JIT compilation took 0.23413658142089844s. +[triton-dejavu] First execution including JIT compilation took 0.5455996990203857s. +[triton-dejavu] First execution including JIT compilation took 0.38571715354919434s. +[triton-dejavu] First execution including JIT compilation took 0.31468629837036133s. +[triton-dejavu] First execution including JIT compilation took 0.9226047992706299s. +[triton-dejavu] First execution including JIT compilation took 0.5366237163543701s. +[triton-dejavu] First execution including JIT compilation took 0.33862876892089844s. +[triton-dejavu] First execution including JIT compilation took 0.7460176944732666s. +[triton-dejavu] First execution including JIT compilation took 0.5355172157287598s. +[triton-dejavu] First execution including JIT compilation took 0.3547065258026123s. +[triton-dejavu] First execution including JIT compilation took 0.7944064140319824s. +[triton-dejavu] First execution including JIT compilation took 0.5351183414459229s. +[triton-dejavu] First execution including JIT compilation took 0.38912463188171387s. +[triton-dejavu] First execution including JIT compilation took 0.6645946502685547s. +[triton-dejavu] First execution including JIT compilation took 0.361285924911499s. +[triton-dejavu] First execution including JIT compilation took 0.26433610916137695s. +[triton-dejavu] First execution including JIT compilation took 0.7722549438476562s. +[triton-dejavu] First execution including JIT compilation took 0.43912410736083984s. +[triton-dejavu] First execution including JIT compilation took 0.34244585037231445s. +[triton-dejavu] First execution including JIT compilation took 0.8954603672027588s. +[triton-dejavu] First execution including JIT compilation took 0.4297215938568115s. +[triton-dejavu] First execution including JIT compilation took 0.29612112045288086s. +[triton-dejavu] First execution including JIT compilation took 0.8536269664764404s. +[triton-dejavu] First execution including JIT compilation took 0.4335141181945801s. +[triton-dejavu] First execution including JIT compilation took 0.3198516368865967s. +[triton-dejavu] First execution including JIT compilation took 0.9171550273895264s. +[triton-dejavu] First execution including JIT compilation took 0.49994635581970215s. +[triton-dejavu] First execution including JIT compilation took 0.3632478713989258s. +[triton-dejavu] First execution including JIT compilation took 0.9547479152679443s. +[triton-dejavu] First execution including JIT compilation took 0.6412930488586426s. +[triton-dejavu] First execution including JIT compilation took 0.38673996925354004s. +bench_cudagraph failed with out of resource: shared memory, Required: 240128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+bench_cudagraph failed with out of resource: shared memory, Required: 272896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 272896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 1.1171579360961914s.
+[triton-dejavu] First execution including JIT compilation took 0.4610905647277832s.
+[triton-dejavu] First execution including JIT compilation took 0.29224681854248047s.
+[triton-dejavu] First execution including JIT compilation took 1.3466551303863525s.
+[triton-dejavu] First execution including JIT compilation took 0.5738677978515625s.
+[triton-dejavu] First execution including JIT compilation took 0.39911484718322754s.
+[triton-dejavu] First execution including JIT compilation took 2.352712631225586s.
+[triton-dejavu] First execution including JIT compilation took 0.7405276298522949s.
+[triton-dejavu] First execution including JIT compilation took 0.3971683979034424s.
+[triton-dejavu] First execution including JIT compilation took 2.751913070678711s.
+bench_cudagraph failed with out of resource: shared memory, Required: 271360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 271360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 274432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 343040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 408576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 408576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 480256, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 545792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 545792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 0.33484816551208496s.
+[triton-dejavu] First execution including JIT compilation took 0.2522711753845215s.
+[triton-dejavu] First execution including JIT compilation took 0.2160511016845703s.
+[triton-dejavu] First execution including JIT compilation took 0.39322805404663086s.
+[triton-dejavu] First execution including JIT compilation took 0.30945301055908203s.
+[triton-dejavu] First execution including JIT compilation took 0.23270702362060547s.
+[triton-dejavu] First execution including JIT compilation took 0.40409421920776367s.
+[triton-dejavu] First execution including JIT compilation took 0.29307126998901367s.
+[triton-dejavu] First execution including JIT compilation took 0.23789381980895996s.
+[triton-dejavu] First execution including JIT compilation took 0.4281790256500244s.
+[triton-dejavu] First execution including JIT compilation took 0.3268117904663086s.
+[triton-dejavu] First execution including JIT compilation took 0.23137664794921875s.
+[triton-dejavu] First execution including JIT compilation took 0.5715954303741455s.
+[triton-dejavu] First execution including JIT compilation took 0.34709930419921875s.
+[triton-dejavu] First execution including JIT compilation took 0.2881906032562256s.
+[triton-dejavu] First execution including JIT compilation took 0.5386977195739746s.
+[triton-dejavu] First execution including JIT compilation took 0.3279886245727539s.
+[triton-dejavu] First execution including JIT compilation took 0.2740349769592285s.
+[triton-dejavu] First execution including JIT compilation took 0.7362427711486816s.
+[triton-dejavu] First execution including JIT compilation took 0.3900623321533203s.
+[triton-dejavu] First execution including JIT compilation took 0.5564815998077393s.
+[triton-dejavu] First execution including JIT compilation took 0.43883204460144043s.
+[triton-dejavu] First execution including JIT compilation took 0.29198122024536133s.
+[triton-dejavu] First execution including JIT compilation took 0.22876620292663574s.
+[triton-dejavu] First execution including JIT compilation took 0.5440170764923096s.
+[triton-dejavu] First execution including JIT compilation took 0.31906890869140625s. +[triton-dejavu] First execution including JIT compilation took 0.24542737007141113s. +[triton-dejavu] First execution including JIT compilation took 0.5503432750701904s. +[triton-dejavu] First execution including JIT compilation took 0.5911548137664795s. +[triton-dejavu] First execution including JIT compilation took 0.2854585647583008s. +[triton-dejavu] First execution including JIT compilation took 0.5509939193725586s. +[triton-dejavu] First execution including JIT compilation took 0.35714101791381836s. +[triton-dejavu] First execution including JIT compilation took 0.32631993293762207s. +[triton-dejavu] First execution including JIT compilation took 0.7352540493011475s. +[triton-dejavu] First execution including JIT compilation took 0.4919905662536621s. +[triton-dejavu] First execution including JIT compilation took 0.3511998653411865s. +[triton-dejavu] First execution including JIT compilation took 0.869816780090332s. +[triton-dejavu] First execution including JIT compilation took 0.6908586025238037s. +[triton-dejavu] First execution including JIT compilation took 0.3677358627319336s. +[triton-dejavu] First execution including JIT compilation took 1.0730576515197754s. +[triton-dejavu] First execution including JIT compilation took 0.5986707210540771s. +[triton-dejavu] First execution including JIT compilation took 0.40021514892578125s. +[triton-dejavu] First execution including JIT compilation took 0.7205333709716797s. +[triton-dejavu] First execution including JIT compilation took 0.4558281898498535s. +[triton-dejavu] First execution including JIT compilation took 0.3011903762817383s. +[triton-dejavu] First execution including JIT compilation took 0.7565047740936279s. +[triton-dejavu] First execution including JIT compilation took 0.5372674465179443s. +[triton-dejavu] First execution including JIT compilation took 0.35286855697631836s. +[triton-dejavu] First execution including JIT compilation took 0.778130292892456s. +[triton-dejavu] First execution including JIT compilation took 0.48418164253234863s. +[triton-dejavu] First execution including JIT compilation took 0.40369200706481934s. +[triton-dejavu] First execution including JIT compilation took 0.8576066493988037s. +[triton-dejavu] First execution including JIT compilation took 0.6475625038146973s. +[triton-dejavu] First execution including JIT compilation took 0.3454623222351074s. +[triton-dejavu] First execution including JIT compilation took 0.8133499622344971s. +[triton-dejavu] First execution including JIT compilation took 0.6029653549194336s. +[triton-dejavu] First execution including JIT compilation took 0.36894989013671875s. +[triton-dejavu] First execution including JIT compilation took 0.8726181983947754s. +[triton-dejavu] First execution including JIT compilation took 0.5887446403503418s. +[triton-dejavu] First execution including JIT compilation took 0.38448596000671387s. +[triton-dejavu] First execution including JIT compilation took 1.022134780883789s. +[triton-dejavu] First execution including JIT compilation took 0.67586350440979s. +[triton-dejavu] First execution including JIT compilation took 0.4078867435455322s. +[triton-dejavu] First execution including JIT compilation took 1.0992224216461182s. +[triton-dejavu] First execution including JIT compilation took 0.5207531452178955s. +[triton-dejavu] First execution including JIT compilation took 0.303997278213501s. 
+[triton-dejavu] First execution including JIT compilation took 1.1604199409484863s.
+[triton-dejavu] First execution including JIT compilation took 0.5801262855529785s.
+[triton-dejavu] First execution including JIT compilation took 0.413867712020874s.
+[triton-dejavu] First execution including JIT compilation took 1.658043384552002s.
+[triton-dejavu] First execution including JIT compilation took 0.6655893325805664s.
+[triton-dejavu] First execution including JIT compilation took 0.4302644729614258s.
+[triton-dejavu] First execution including JIT compilation took 1.9427084922790527s.
+[triton-dejavu] First execution including JIT compilation took 0.8315591812133789s.
+[triton-dejavu] First execution including JIT compilation took 0.5458030700683594s.
+[triton-dejavu] First execution including JIT compilation took 1.938206672668457s.
+bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 253440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 302592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 302592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 354816, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 403968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 403968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 2.1561598777770996s.
+[triton-dejavu] First execution including JIT compilation took 1.0098638534545898s.
+[triton-dejavu] First execution including JIT compilation took 0.4696693420410156s.
+[triton-dejavu] First execution including JIT compilation took 2.832549571990967s.
+[triton-dejavu] First execution including JIT compilation took 1.2741048336029053s.
+[triton-dejavu] First execution including JIT compilation took 0.5360772609710693s.
+[triton-dejavu] First execution including JIT compilation took 7.192165851593018s.
+bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 304128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 402432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 402432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 405504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 503808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 503808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 506880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 605184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 605184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 605184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 605184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 709632, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 709632, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 807936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 807936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 807936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+[triton-dejavu] First execution including JIT compilation took 0.5908124446868896s.
+[triton-dejavu] First execution including JIT compilation took 0.41149401664733887s.
+[triton-dejavu] First execution including JIT compilation took 0.2559213638305664s.
+[triton-dejavu] First execution including JIT compilation took 0.7076609134674072s.
+[triton-dejavu] First execution including JIT compilation took 0.5295822620391846s.
+[triton-dejavu] First execution including JIT compilation took 0.31011199951171875s.
+[triton-dejavu] First execution including JIT compilation took 0.7750310897827148s.
+[triton-dejavu] First execution including JIT compilation took 0.4493274688720703s.
+[triton-dejavu] First execution including JIT compilation took 0.30690884590148926s.
+[triton-dejavu] First execution including JIT compilation took 0.7551653385162354s.
+[triton-dejavu] First execution including JIT compilation took 0.46668338775634766s.
+[triton-dejavu] First execution including JIT compilation took 0.30584287643432617s.
+[triton-dejavu] First execution including JIT compilation took 0.7725615501403809s.
+[triton-dejavu] First execution including JIT compilation took 0.482954740524292s.
+[triton-dejavu] First execution including JIT compilation took 0.3182220458984375s.
+[triton-dejavu] First execution including JIT compilation took 0.9150772094726562s.
+[triton-dejavu] First execution including JIT compilation took 0.5212767124176025s.
+[triton-dejavu] First execution including JIT compilation took 0.3300950527191162s.
+[triton-dejavu] First execution including JIT compilation took 1.053274393081665s.
+[triton-dejavu] First execution including JIT compilation took 0.5630724430084229s.
+[triton-dejavu] First execution including JIT compilation took 0.3730814456939697s.
+[triton-dejavu] First execution including JIT compilation took 0.79178786277771s.
+[triton-dejavu] First execution including JIT compilation took 0.46175098419189453s.
+[triton-dejavu] First execution including JIT compilation took 0.28571319580078125s.
+[triton-dejavu] First execution including JIT compilation took 0.95066237449646s.
+[triton-dejavu] First execution including JIT compilation took 0.6377534866333008s.
+[triton-dejavu] First execution including JIT compilation took 0.35297322273254395s.
+[triton-dejavu] First execution including JIT compilation took 1.0828590393066406s.
+[triton-dejavu] First execution including JIT compilation took 0.6473112106323242s.
+[triton-dejavu] First execution including JIT compilation took 0.3665587902069092s.
+[triton-dejavu] First execution including JIT compilation took 1.2117927074432373s.
+[triton-dejavu] First execution including JIT compilation took 0.6923372745513916s.
+[triton-dejavu] First execution including JIT compilation took 0.39316701889038086s.
+[triton-dejavu] First execution including JIT compilation took 1.3109822273254395s.
+[triton-dejavu] First execution including JIT compilation took 0.6752035617828369s.
+[triton-dejavu] First execution including JIT compilation took 0.39464235305786133s.
+[triton-dejavu] First execution including JIT compilation took 1.4121100902557373s.
+[triton-dejavu] First execution including JIT compilation took 0.7038490772247314s.
+[triton-dejavu] First execution including JIT compilation took 0.46832728385925293s.
+[triton-dejavu] First execution including JIT compilation took 1.6874134540557861s.
+[triton-dejavu] First execution including JIT compilation took 0.8371837139129639s.
+[triton-dejavu] First execution including JIT compilation took 0.46314477920532227s.
+[triton-dejavu] First execution including JIT compilation took 1.2351336479187012s.
+[triton-dejavu] First execution including JIT compilation took 0.732062816619873s.
+[triton-dejavu] First execution including JIT compilation took 0.3664720058441162s.
+[triton-dejavu] First execution including JIT compilation took 1.3951785564422607s.
+[triton-dejavu] First execution including JIT compilation took 0.8160102367401123s.
+[triton-dejavu] First execution including JIT compilation took 0.4101998805999756s.
+[triton-dejavu] First execution including JIT compilation took 1.886866569519043s.
+[triton-dejavu] First execution including JIT compilation took 0.9539880752563477s.
+[triton-dejavu] First execution including JIT compilation took 0.4695587158203125s.
+[triton-dejavu] First execution including JIT compilation took 2.1268863677978516s.
+[triton-dejavu] First execution including JIT compilation took 1.127213716506958s.
+[triton-dejavu] First execution including JIT compilation took 0.5213239192962646s.
+[triton-dejavu] First execution including JIT compilation took 2.084219217300415s.
+[triton-dejavu] First execution including JIT compilation took 1.8083431720733643s.
+[triton-dejavu] First execution including JIT compilation took 0.7044923305511475s.
+[triton-dejavu] First execution including JIT compilation took 2.561204195022583s.
+bench_cudagraph failed with out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 292096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 333056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 333056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 2.7049872875213623s.
+[triton-dejavu] First execution including JIT compilation took 1.3836045265197754s.
+[triton-dejavu] First execution including JIT compilation took 0.6024000644683838s.
+[triton-dejavu] First execution including JIT compilation took 3.13523006439209s.
+[triton-dejavu] First execution including JIT compilation took 2.1513431072235107s.
+[triton-dejavu] First execution including JIT compilation took 0.8262593746185303s.
+[triton-dejavu] First execution including JIT compilation took 7.682474613189697s.
+bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 250368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 332288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 332288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 415744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 415744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 417280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 499200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 499200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 584192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 666112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 666112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 7.4601991176605225s.
+[triton-dejavu] First execution including JIT compilation took 4.111113786697388s.
+[triton-dejavu] First execution including JIT compilation took 0.9548518657684326s.
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__
+    self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
+RuntimeError: Triton Error [CUDA]: out of memory
+
+bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 500736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 664576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 664576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 667648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 831488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 831488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 834560, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 998400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 998400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 998400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 998400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1168384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1168384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1332224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1332224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1332224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1332224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.22535085678100586s. +[triton-dejavu] First execution including JIT compilation took 0.2475295066833496s. +[triton-dejavu] First execution including JIT compilation took 0.16012358665466309s. +[triton-dejavu] First execution including JIT compilation took 0.22483301162719727s. +[triton-dejavu] First execution including JIT compilation took 0.21385669708251953s. +[triton-dejavu] First execution including JIT compilation took 0.19730210304260254s. +[triton-dejavu] First execution including JIT compilation took 0.2612948417663574s. +[triton-dejavu] First execution including JIT compilation took 0.2422807216644287s. +[triton-dejavu] First execution including JIT compilation took 0.2098245620727539s. +[triton-dejavu] First execution including JIT compilation took 0.27469491958618164s. +[triton-dejavu] First execution including JIT compilation took 0.23059582710266113s. +[triton-dejavu] First execution including JIT compilation took 0.20798087120056152s. +[triton-dejavu] First execution including JIT compilation took 0.27477025985717773s. +[triton-dejavu] First execution including JIT compilation took 0.22785329818725586s. +[triton-dejavu] First execution including JIT compilation took 0.22185730934143066s. +[triton-dejavu] First execution including JIT compilation took 0.28423380851745605s. +[triton-dejavu] First execution including JIT compilation took 0.2566359043121338s. +[triton-dejavu] First execution including JIT compilation took 0.22713565826416016s. +[triton-dejavu] First execution including JIT compilation took 0.3106961250305176s. +[triton-dejavu] First execution including JIT compilation took 0.27785158157348633s. +[triton-dejavu] First execution including JIT compilation took 0.24365854263305664s. +[triton-dejavu] First execution including JIT compilation took 0.26142382621765137s. +[triton-dejavu] First execution including JIT compilation took 0.19252419471740723s. +[triton-dejavu] First execution including JIT compilation took 0.1860034465789795s. 
+[triton-dejavu] First execution including JIT compilation took 0.2948489189147949s. +[triton-dejavu] First execution including JIT compilation took 0.22476696968078613s. +[triton-dejavu] First execution including JIT compilation took 0.26130104064941406s. +[triton-dejavu] First execution including JIT compilation took 0.40126538276672363s. +[triton-dejavu] First execution including JIT compilation took 0.29898667335510254s. +[triton-dejavu] First execution including JIT compilation took 0.27327704429626465s. +[triton-dejavu] First execution including JIT compilation took 0.41613197326660156s. +[triton-dejavu] First execution including JIT compilation took 0.30518221855163574s. +[triton-dejavu] First execution including JIT compilation took 0.28211188316345215s. +[triton-dejavu] First execution including JIT compilation took 0.4438753128051758s. +[triton-dejavu] First execution including JIT compilation took 0.3301517963409424s. +[triton-dejavu] First execution including JIT compilation took 0.29430127143859863s. +[triton-dejavu] First execution including JIT compilation took 0.4660637378692627s. +[triton-dejavu] First execution including JIT compilation took 0.3478364944458008s. +[triton-dejavu] First execution including JIT compilation took 0.30316758155822754s. +[triton-dejavu] First execution including JIT compilation took 0.5299293994903564s. +[triton-dejavu] First execution including JIT compilation took 0.3778262138366699s. +[triton-dejavu] First execution including JIT compilation took 0.33774375915527344s. +[triton-dejavu] First execution including JIT compilation took 0.4039268493652344s. +[triton-dejavu] First execution including JIT compilation took 0.26682138442993164s. +[triton-dejavu] First execution including JIT compilation took 0.22738170623779297s. +[triton-dejavu] First execution including JIT compilation took 0.47388482093811035s. +[triton-dejavu] First execution including JIT compilation took 0.3087284564971924s. +[triton-dejavu] First execution including JIT compilation took 0.2632722854614258s. +[triton-dejavu] First execution including JIT compilation took 0.5885961055755615s. +[triton-dejavu] First execution including JIT compilation took 0.34126925468444824s. +[triton-dejavu] First execution including JIT compilation took 0.29026103019714355s. +[triton-dejavu] First execution including JIT compilation took 0.653205394744873s. +[triton-dejavu] First execution including JIT compilation took 0.37291741371154785s. +[triton-dejavu] First execution including JIT compilation took 0.30518603324890137s. +[triton-dejavu] First execution including JIT compilation took 0.7140650749206543s. +[triton-dejavu] First execution including JIT compilation took 0.4112815856933594s. +[triton-dejavu] First execution including JIT compilation took 0.3285210132598877s. +[triton-dejavu] First execution including JIT compilation took 0.7528486251831055s. +[triton-dejavu] First execution including JIT compilation took 0.4412572383880615s. +[triton-dejavu] First execution including JIT compilation took 0.3455331325531006s. +[triton-dejavu] First execution including JIT compilation took 0.8772494792938232s. +[triton-dejavu] First execution including JIT compilation took 0.4721558094024658s. +[triton-dejavu] First execution including JIT compilation took 0.3847846984863281s. +[triton-dejavu] First execution including JIT compilation took 0.6612024307250977s. +[triton-dejavu] First execution including JIT compilation took 0.3503909111022949s. 
+[triton-dejavu] First execution including JIT compilation took 0.25101804733276367s. +[triton-dejavu] First execution including JIT compilation took 0.8109943866729736s. +[triton-dejavu] First execution including JIT compilation took 0.38869762420654297s. +[triton-dejavu] First execution including JIT compilation took 0.2938868999481201s. +[triton-dejavu] First execution including JIT compilation took 1.3092761039733887s. +[triton-dejavu] First execution including JIT compilation took 0.49189066886901855s. +[triton-dejavu] First execution including JIT compilation took 0.3337218761444092s. +[triton-dejavu] First execution including JIT compilation took 1.4332802295684814s. +[triton-dejavu] First execution including JIT compilation took 0.509821891784668s. +[triton-dejavu] First execution including JIT compilation took 0.3853490352630615s. +[triton-dejavu] First execution including JIT compilation took 1.5309038162231445s. +[triton-dejavu] First execution including JIT compilation took 0.5586769580841064s. +[triton-dejavu] First execution including JIT compilation took 0.388033390045166s. +[triton-dejavu] First execution including JIT compilation took 1.588334083557129s. +[triton-dejavu] First execution including JIT compilation took 0.49126362800598145s. +[triton-dejavu] First execution including JIT compilation took 0.34864377975463867s. +bench_cudagraph failed with out of resource: shared memory, Required: 268800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 268800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 305664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 305664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 305664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 305664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 1.4631285667419434s. +[triton-dejavu] First execution including JIT compilation took 0.4619269371032715s. +[triton-dejavu] First execution including JIT compilation took 0.29707860946655273s. +[triton-dejavu] First execution including JIT compilation took 1.3915040493011475s. +[triton-dejavu] First execution including JIT compilation took 0.5160026550292969s. +[triton-dejavu] First execution including JIT compilation took 0.3659961223602295s. 
+[triton-dejavu] First execution including JIT compilation took 5.161684989929199s. +[triton-dejavu] First execution including JIT compilation took 1.0375711917877197s. +[triton-dejavu] First execution including JIT compilation took 0.3804037570953369s. +[triton-dejavu] First execution including JIT compilation took 5.337275505065918s. +bench_cudagraph failed with out of resource: shared memory, Required: 304128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 304128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 304128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 304128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+
+bench_cudagraph failed with out of resource: shared memory, Required: 307200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 380928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 380928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 384000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 457728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 457728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 537600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 611328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 611328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 611328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.31734514236450195s. +[triton-dejavu] First execution including JIT compilation took 0.2059774398803711s. +[triton-dejavu] First execution including JIT compilation took 0.1928870677947998s. +[triton-dejavu] First execution including JIT compilation took 0.33611416816711426s. +[triton-dejavu] First execution including JIT compilation took 0.23422932624816895s. +[triton-dejavu] First execution including JIT compilation took 0.21037578582763672s. +[triton-dejavu] First execution including JIT compilation took 0.289534330368042s. +[triton-dejavu] First execution including JIT compilation took 0.24709415435791016s. +[triton-dejavu] First execution including JIT compilation took 0.2226250171661377s. +[triton-dejavu] First execution including JIT compilation took 0.3181033134460449s. +[triton-dejavu] First execution including JIT compilation took 0.2734520435333252s. +[triton-dejavu] First execution including JIT compilation took 0.2189197540283203s. +[triton-dejavu] First execution including JIT compilation took 0.3356783390045166s. +[triton-dejavu] First execution including JIT compilation took 0.2723813056945801s. +[triton-dejavu] First execution including JIT compilation took 0.2521681785583496s. +[triton-dejavu] First execution including JIT compilation took 0.32643723487854004s. +[triton-dejavu] First execution including JIT compilation took 0.278456449508667s. +[triton-dejavu] First execution including JIT compilation took 0.25096702575683594s. +[triton-dejavu] First execution including JIT compilation took 0.35304760932922363s. +[triton-dejavu] First execution including JIT compilation took 0.29747843742370605s. +[triton-dejavu] First execution including JIT compilation took 0.2503805160522461s. +[triton-dejavu] First execution including JIT compilation took 0.3057088851928711s. +[triton-dejavu] First execution including JIT compilation took 0.2160186767578125s. +[triton-dejavu] First execution including JIT compilation took 0.1883528232574463s. +[triton-dejavu] First execution including JIT compilation took 0.3313779830932617s. 
+[triton-dejavu] First execution including JIT compilation took 0.24627685546875s. +[triton-dejavu] First execution including JIT compilation took 0.201185941696167s. +[triton-dejavu] First execution including JIT compilation took 0.3443264961242676s. +[triton-dejavu] First execution including JIT compilation took 0.2596099376678467s. +[triton-dejavu] First execution including JIT compilation took 0.23357057571411133s. +[triton-dejavu] First execution including JIT compilation took 0.42798876762390137s. +[triton-dejavu] First execution including JIT compilation took 0.30511474609375s. +[triton-dejavu] First execution including JIT compilation took 0.24922823905944824s. +[triton-dejavu] First execution including JIT compilation took 0.4275035858154297s. +[triton-dejavu] First execution including JIT compilation took 0.3170912265777588s. +[triton-dejavu] First execution including JIT compilation took 0.25102734565734863s. +[triton-dejavu] First execution including JIT compilation took 0.4548606872558594s. +[triton-dejavu] First execution including JIT compilation took 0.2932870388031006s. +[triton-dejavu] First execution including JIT compilation took 0.25251173973083496s. +[triton-dejavu] First execution including JIT compilation took 0.5132782459259033s. +[triton-dejavu] First execution including JIT compilation took 0.3854689598083496s. +[triton-dejavu] First execution including JIT compilation took 0.276400089263916s. +[triton-dejavu] First execution including JIT compilation took 0.3926353454589844s. +[triton-dejavu] First execution including JIT compilation took 0.24996232986450195s. +[triton-dejavu] First execution including JIT compilation took 0.21382498741149902s. +[triton-dejavu] First execution including JIT compilation took 0.4578080177307129s. +[triton-dejavu] First execution including JIT compilation took 0.29611897468566895s. +[triton-dejavu] First execution including JIT compilation took 0.2173306941986084s. +[triton-dejavu] First execution including JIT compilation took 0.548072099685669s. +[triton-dejavu] First execution including JIT compilation took 0.33872079849243164s. +[triton-dejavu] First execution including JIT compilation took 0.23550057411193848s. +[triton-dejavu] First execution including JIT compilation took 0.5951023101806641s. +[triton-dejavu] First execution including JIT compilation took 0.349484920501709s. +[triton-dejavu] First execution including JIT compilation took 0.25032520294189453s. +[triton-dejavu] First execution including JIT compilation took 0.6920459270477295s. +[triton-dejavu] First execution including JIT compilation took 0.3887183666229248s. +[triton-dejavu] First execution including JIT compilation took 0.27884531021118164s. +[triton-dejavu] First execution including JIT compilation took 0.6650998592376709s. +[triton-dejavu] First execution including JIT compilation took 0.38024473190307617s. +[triton-dejavu] First execution including JIT compilation took 0.293820858001709s. +[triton-dejavu] First execution including JIT compilation took 0.7408139705657959s. +[triton-dejavu] First execution including JIT compilation took 0.45377397537231445s. +[triton-dejavu] First execution including JIT compilation took 0.3186800479888916s. +[triton-dejavu] First execution including JIT compilation took 0.7443890571594238s. +[triton-dejavu] First execution including JIT compilation took 0.3219418525695801s. +[triton-dejavu] First execution including JIT compilation took 0.23611903190612793s. 
+[triton-dejavu] First execution including JIT compilation took 0.7835826873779297s. +[triton-dejavu] First execution including JIT compilation took 0.3625168800354004s. +[triton-dejavu] First execution including JIT compilation took 0.2934072017669678s. +[triton-dejavu] First execution including JIT compilation took 1.3723728656768799s. +[triton-dejavu] First execution including JIT compilation took 0.5136263370513916s. +[triton-dejavu] First execution including JIT compilation took 0.3561995029449463s. +[triton-dejavu] First execution including JIT compilation took 1.3893115520477295s. +[triton-dejavu] First execution including JIT compilation took 0.4639883041381836s. +[triton-dejavu] First execution including JIT compilation took 0.32212138175964355s. +[triton-dejavu] First execution including JIT compilation took 1.5799453258514404s. +[triton-dejavu] First execution including JIT compilation took 0.519599199295044s. +[triton-dejavu] First execution including JIT compilation took 0.34169602394104004s. +[triton-dejavu] First execution including JIT compilation took 1.520521640777588s. +bench_cudagraph failed with out of resource: shared memory, Required: 253440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 253440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 253440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+bench_cudagraph failed with out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 338432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 338432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 338432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 338432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 1.3502428531646729s. +[triton-dejavu] First execution including JIT compilation took 0.5446245670318604s. +[triton-dejavu] First execution including JIT compilation took 0.3224365711212158s. +[triton-dejavu] First execution including JIT compilation took 1.8596394062042236s. +[triton-dejavu] First execution including JIT compilation took 0.6688938140869141s. +[triton-dejavu] First execution including JIT compilation took 0.37556910514831543s. 
+[triton-dejavu] First execution including JIT compilation took 5.96744441986084s. +bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 254976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 254976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 336896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 336896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 336896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 336896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 421888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 421888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 421888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 421888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 424960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 424960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 506880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 506880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 506880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 506880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 676864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 676864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 676864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 676864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.3655416965484619s. +[triton-dejavu] First execution including JIT compilation took 0.23999977111816406s. +[triton-dejavu] First execution including JIT compilation took 0.19980168342590332s. +[triton-dejavu] First execution including JIT compilation took 0.3963167667388916s. +[triton-dejavu] First execution including JIT compilation took 0.29250240325927734s. +[triton-dejavu] First execution including JIT compilation took 0.23019075393676758s. 
+[triton-dejavu] First execution including JIT compilation took 0.3897242546081543s. +[triton-dejavu] First execution including JIT compilation took 0.26331114768981934s. +[triton-dejavu] First execution including JIT compilation took 0.25720906257629395s. +[triton-dejavu] First execution including JIT compilation took 0.5272367000579834s. +[triton-dejavu] First execution including JIT compilation took 0.3620729446411133s. +[triton-dejavu] First execution including JIT compilation took 0.30249667167663574s. +[triton-dejavu] First execution including JIT compilation took 0.5528614521026611s. +[triton-dejavu] First execution including JIT compilation took 0.38202786445617676s. +[triton-dejavu] First execution including JIT compilation took 0.32003331184387207s. +[triton-dejavu] First execution including JIT compilation took 0.5820906162261963s. +[triton-dejavu] First execution including JIT compilation took 0.3516204357147217s. +[triton-dejavu] First execution including JIT compilation took 0.2539525032043457s. +[triton-dejavu] First execution including JIT compilation took 0.5132083892822266s. +[triton-dejavu] First execution including JIT compilation took 0.3485991954803467s. +[triton-dejavu] First execution including JIT compilation took 0.26761674880981445s. +[triton-dejavu] First execution including JIT compilation took 0.3998754024505615s. +[triton-dejavu] First execution including JIT compilation took 0.270932674407959s. +[triton-dejavu] First execution including JIT compilation took 0.21268010139465332s. +[triton-dejavu] First execution including JIT compilation took 0.4931457042694092s. +[triton-dejavu] First execution including JIT compilation took 0.3084697723388672s. +[triton-dejavu] First execution including JIT compilation took 0.22578716278076172s. +[triton-dejavu] First execution including JIT compilation took 0.4800398349761963s. +[triton-dejavu] First execution including JIT compilation took 0.3248765468597412s. +[triton-dejavu] First execution including JIT compilation took 0.25438714027404785s. +[triton-dejavu] First execution including JIT compilation took 0.5268030166625977s. +[triton-dejavu] First execution including JIT compilation took 0.32793354988098145s. +[triton-dejavu] First execution including JIT compilation took 0.2654423713684082s. +[triton-dejavu] First execution including JIT compilation took 0.5680561065673828s. +[triton-dejavu] First execution including JIT compilation took 0.3322784900665283s. +[triton-dejavu] First execution including JIT compilation took 0.25258684158325195s. +[triton-dejavu] First execution including JIT compilation took 0.5792534351348877s. +[triton-dejavu] First execution including JIT compilation took 0.5247256755828857s. +[triton-dejavu] First execution including JIT compilation took 0.3439359664916992s. +[triton-dejavu] First execution including JIT compilation took 0.8489353656768799s. +[triton-dejavu] First execution including JIT compilation took 0.5044565200805664s. +[triton-dejavu] First execution including JIT compilation took 0.39157629013061523s. +[triton-dejavu] First execution including JIT compilation took 0.733513355255127s. +[triton-dejavu] First execution including JIT compilation took 0.38277316093444824s. +[triton-dejavu] First execution including JIT compilation took 0.2873697280883789s. +[triton-dejavu] First execution including JIT compilation took 0.8169002532958984s. +[triton-dejavu] First execution including JIT compilation took 0.3655128479003906s. 
+[triton-dejavu] First execution including JIT compilation took 0.26145172119140625s. +[triton-dejavu] First execution including JIT compilation took 0.8048985004425049s. +[triton-dejavu] First execution including JIT compilation took 0.40337085723876953s. +[triton-dejavu] First execution including JIT compilation took 0.2873227596282959s. +[triton-dejavu] First execution including JIT compilation took 0.7874279022216797s. +[triton-dejavu] First execution including JIT compilation took 0.4543600082397461s. +[triton-dejavu] First execution including JIT compilation took 0.30629849433898926s. +[triton-dejavu] First execution including JIT compilation took 1.1004579067230225s. +[triton-dejavu] First execution including JIT compilation took 0.595219612121582s. +[triton-dejavu] First execution including JIT compilation took 0.4115121364593506s. +[triton-dejavu] First execution including JIT compilation took 1.1447741985321045s. +[triton-dejavu] First execution including JIT compilation took 0.6449964046478271s. +[triton-dejavu] First execution including JIT compilation took 0.42902207374572754s. +[triton-dejavu] First execution including JIT compilation took 1.485217809677124s. +[triton-dejavu] First execution including JIT compilation took 0.7232568264007568s. +[triton-dejavu] First execution including JIT compilation took 0.478473424911499s. +[triton-dejavu] First execution including JIT compilation took 1.249863862991333s. +[triton-dejavu] First execution including JIT compilation took 0.5245099067687988s. +[triton-dejavu] First execution including JIT compilation took 0.3504352569580078s. +[triton-dejavu] First execution including JIT compilation took 1.5189287662506104s. +[triton-dejavu] First execution including JIT compilation took 0.5968093872070312s. +[triton-dejavu] First execution including JIT compilation took 0.4099123477935791s. +[triton-dejavu] First execution including JIT compilation took 2.0371575355529785s. +[triton-dejavu] First execution including JIT compilation took 0.5784683227539062s. +[triton-dejavu] First execution including JIT compilation took 0.3639485836029053s. +[triton-dejavu] First execution including JIT compilation took 2.0350635051727295s. +[triton-dejavu] First execution including JIT compilation took 0.7109920978546143s. +[triton-dejavu] First execution including JIT compilation took 0.4702615737915039s. +[triton-dejavu] First execution including JIT compilation took 1.9885196685791016s. +bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 253440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 253440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 302592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 302592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 302592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 302592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 354816, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 354816, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 403968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 403968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 403968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 403968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 2.343762159347534s. +[triton-dejavu] First execution including JIT compilation took 0.7591912746429443s. +[triton-dejavu] First execution including JIT compilation took 0.41748905181884766s. +[triton-dejavu] First execution including JIT compilation took 2.6733522415161133s. +[triton-dejavu] First execution including JIT compilation took 0.8401713371276855s. +[triton-dejavu] First execution including JIT compilation took 0.44391798973083496s. 
+[triton-dejavu] First execution including JIT compilation took 8.486325740814209s. +bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 304128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 304128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 402432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 402432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 402432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 402432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 405504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 405504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 503808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 503808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 503808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 503808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 506880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+
+bench_cudagraph failed with out of resource: shared memory, Required: 605184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 605184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 709632, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 807936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 807936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 0.6625301837921143s.
+[triton-dejavu] First execution including JIT compilation took 0.4150726795196533s.
+[triton-dejavu] First execution including JIT compilation took 0.2472681999206543s.
+[triton-dejavu] First execution including JIT compilation took 0.6982488632202148s.
+[triton-dejavu] First execution including JIT compilation took 0.38659119606018066s.
+[triton-dejavu] First execution including JIT compilation took 0.2670154571533203s.
+[triton-dejavu] First execution including JIT compilation took 0.6746160984039307s. +[triton-dejavu] First execution including JIT compilation took 0.38513994216918945s. +[triton-dejavu] First execution including JIT compilation took 0.2773430347442627s. +[triton-dejavu] First execution including JIT compilation took 0.7035007476806641s. +[triton-dejavu] First execution including JIT compilation took 0.43756604194641113s. +[triton-dejavu] First execution including JIT compilation took 0.27396464347839355s. +[triton-dejavu] First execution including JIT compilation took 0.8079738616943359s. +[triton-dejavu] First execution including JIT compilation took 0.4706554412841797s. +[triton-dejavu] First execution including JIT compilation took 0.3690028190612793s. +[triton-dejavu] First execution including JIT compilation took 1.0047783851623535s. +[triton-dejavu] First execution including JIT compilation took 0.4361457824707031s. +[triton-dejavu] First execution including JIT compilation took 0.364422082901001s. +[triton-dejavu] First execution including JIT compilation took 0.8165838718414307s. +[triton-dejavu] First execution including JIT compilation took 0.48160290718078613s. +[triton-dejavu] First execution including JIT compilation took 0.3796987533569336s. +[triton-dejavu] First execution including JIT compilation took 0.8447437286376953s. +[triton-dejavu] First execution including JIT compilation took 0.3994133472442627s. +[triton-dejavu] First execution including JIT compilation took 0.3070847988128662s. +[triton-dejavu] First execution including JIT compilation took 0.8190915584564209s. +[triton-dejavu] First execution including JIT compilation took 0.44321155548095703s. +[triton-dejavu] First execution including JIT compilation took 0.30388951301574707s. +[triton-dejavu] First execution including JIT compilation took 0.8760182857513428s. +[triton-dejavu] First execution including JIT compilation took 0.498699426651001s. +[triton-dejavu] First execution including JIT compilation took 0.33666563034057617s. +[triton-dejavu] First execution including JIT compilation took 0.9385683536529541s. +[triton-dejavu] First execution including JIT compilation took 0.49439096450805664s. +[triton-dejavu] First execution including JIT compilation took 0.34059906005859375s. +[triton-dejavu] First execution including JIT compilation took 0.9848973751068115s. +[triton-dejavu] First execution including JIT compilation took 0.5026867389678955s. +[triton-dejavu] First execution including JIT compilation took 0.3362538814544678s. +[triton-dejavu] First execution including JIT compilation took 1.054696798324585s. +[triton-dejavu] First execution including JIT compilation took 0.5362629890441895s. +[triton-dejavu] First execution including JIT compilation took 0.3406381607055664s. +[triton-dejavu] First execution including JIT compilation took 1.1980383396148682s. +[triton-dejavu] First execution including JIT compilation took 0.6002733707427979s. +[triton-dejavu] First execution including JIT compilation took 0.4032762050628662s. +[triton-dejavu] First execution including JIT compilation took 1.0670430660247803s. +[triton-dejavu] First execution including JIT compilation took 0.5054290294647217s. +[triton-dejavu] First execution including JIT compilation took 0.31844234466552734s. +[triton-dejavu] First execution including JIT compilation took 1.20845365524292s. +[triton-dejavu] First execution including JIT compilation took 0.635533332824707s. 
+[triton-dejavu] First execution including JIT compilation took 0.39752840995788574s.
+[triton-dejavu] First execution including JIT compilation took 1.2634165287017822s.
+[triton-dejavu] First execution including JIT compilation took 0.6931250095367432s.
+[triton-dejavu] First execution including JIT compilation took 0.3806438446044922s.
+[triton-dejavu] First execution including JIT compilation took 1.3524491786956787s.
+[triton-dejavu] First execution including JIT compilation took 0.6660432815551758s.
+[triton-dejavu] First execution including JIT compilation took 0.4266016483306885s.
+[triton-dejavu] First execution including JIT compilation took 1.3512389659881592s.
+[triton-dejavu] First execution including JIT compilation took 0.7300617694854736s.
+[triton-dejavu] First execution including JIT compilation took 0.4240868091583252s.
+[triton-dejavu] First execution including JIT compilation took 1.5339932441711426s.
+[triton-dejavu] First execution including JIT compilation took 0.7730631828308105s.
+[triton-dejavu] First execution including JIT compilation took 0.506572961807251s.
+bench_cudagraph failed with out of resource: shared memory, Required: 234752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 267520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 267520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 1.9073479175567627s.
+[triton-dejavu] First execution including JIT compilation took 0.7516419887542725s.
+[triton-dejavu] First execution including JIT compilation took 0.49443531036376953s.
+[triton-dejavu] First execution including JIT compilation took 2.24280047416687s.
+[triton-dejavu] First execution including JIT compilation took 0.8264782428741455s.
+[triton-dejavu] First execution including JIT compilation took 0.49369287490844727s.
+[triton-dejavu] First execution including JIT compilation took 3.268693208694458s.
+[triton-dejavu] First execution including JIT compilation took 0.9770853519439697s.
+[triton-dejavu] First execution including JIT compilation took 0.5697095394134521s.
+[triton-dejavu] First execution including JIT compilation took 3.2506027221679688s.
+bench_cudagraph failed with out of resource: shared memory, Required: 266752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 266752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 335360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 400896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 400896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 469504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 535040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 535040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 4.055661916732788s.
+[triton-dejavu] First execution including JIT compilation took 1.567265272140503s.
+[triton-dejavu] First execution including JIT compilation took 0.6991770267486572s.
+[triton-dejavu] First execution including JIT compilation took 4.434000730514526s.
+bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 402432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 533504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 533504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 536576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 667648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 667648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 670720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 801792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 801792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 939008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 1070080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 1070080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 1.2387633323669434s.
+[triton-dejavu] First execution including JIT compilation took 0.6717429161071777s.
+[triton-dejavu] First execution including JIT compilation took 0.4157521724700928s.
+[triton-dejavu] First execution including JIT compilation took 1.316270351409912s.
+[triton-dejavu] First execution including JIT compilation took 0.7596695423126221s.
+[triton-dejavu] First execution including JIT compilation took 0.4030904769897461s.
+[triton-dejavu] First execution including JIT compilation took 1.4970901012420654s.
+[triton-dejavu] First execution including JIT compilation took 0.9932441711425781s.
+[triton-dejavu] First execution including JIT compilation took 0.4329719543457031s.
+[triton-dejavu] First execution including JIT compilation took 1.4383361339569092s.
+[triton-dejavu] First execution including JIT compilation took 0.7822005748748779s.
+[triton-dejavu] First execution including JIT compilation took 0.5704188346862793s.
+[triton-dejavu] First execution including JIT compilation took 1.9332118034362793s.
+[triton-dejavu] First execution including JIT compilation took 1.031646728515625s.
+[triton-dejavu] First execution including JIT compilation took 0.5641162395477295s.
+[triton-dejavu] First execution including JIT compilation took 1.697702407836914s.
+[triton-dejavu] First execution including JIT compilation took 0.82403564453125s.
+[triton-dejavu] First execution including JIT compilation took 0.49923229217529297s.
+[triton-dejavu] First execution including JIT compilation took 1.6510090827941895s.
+[triton-dejavu] First execution including JIT compilation took 0.8686649799346924s.
+[triton-dejavu] First execution including JIT compilation took 0.49845027923583984s.
+[triton-dejavu] First execution including JIT compilation took 1.5030395984649658s.
+[triton-dejavu] First execution including JIT compilation took 1.132683277130127s.
+[triton-dejavu] First execution including JIT compilation took 0.5156295299530029s.
+[triton-dejavu] First execution including JIT compilation took 1.8180909156799316s.
+[triton-dejavu] First execution including JIT compilation took 0.8981871604919434s.
+[triton-dejavu] First execution including JIT compilation took 0.46332740783691406s.
+[triton-dejavu] First execution including JIT compilation took 2.3511245250701904s.
+[triton-dejavu] First execution including JIT compilation took 1.3827064037322998s.
+[triton-dejavu] First execution including JIT compilation took 0.6484944820404053s.
+[triton-dejavu] First execution including JIT compilation took 2.287813663482666s.
+[triton-dejavu] First execution including JIT compilation took 1.1250503063201904s.
+[triton-dejavu] First execution including JIT compilation took 0.5330066680908203s.
+[triton-dejavu] First execution including JIT compilation took 2.089167594909668s.
+[triton-dejavu] First execution including JIT compilation took 1.1511783599853516s.
+[triton-dejavu] First execution including JIT compilation took 0.5573456287384033s.
+[triton-dejavu] First execution including JIT compilation took 2.3208694458007812s.
+[triton-dejavu] First execution including JIT compilation took 1.2315797805786133s.
+[triton-dejavu] First execution including JIT compilation took 0.6272659301757812s.
+[triton-dejavu] First execution including JIT compilation took 2.580502510070801s.
+[triton-dejavu] First execution including JIT compilation took 1.3232295513153076s.
+[triton-dejavu] First execution including JIT compilation took 0.6254336833953857s.
+[triton-dejavu] First execution including JIT compilation took 2.4162003993988037s.
+[triton-dejavu] First execution including JIT compilation took 1.2951240539550781s.
+[triton-dejavu] First execution including JIT compilation took 0.6093685626983643s.
+[triton-dejavu] First execution including JIT compilation took 2.4826173782348633s.
+[triton-dejavu] First execution including JIT compilation took 1.444998025894165s.
+[triton-dejavu] First execution including JIT compilation took 0.6317059993743896s.
+[triton-dejavu] First execution including JIT compilation took 3.2404682636260986s.
+[triton-dejavu] First execution including JIT compilation took 1.592949628829956s.
+[triton-dejavu] First execution including JIT compilation took 0.7632882595062256s.
+[triton-dejavu] First execution including JIT compilation took 3.501128911972046s.
+[triton-dejavu] First execution including JIT compilation took 1.8167932033538818s.
+[triton-dejavu] First execution including JIT compilation took 0.7571108341217041s.
+[triton-dejavu] First execution including JIT compilation took 3.3915903568267822s.
+bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 298752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 298752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 349440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 398592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 398592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 4.791898488998413s.
+[triton-dejavu] First execution including JIT compilation took 2.434124231338501s.
+[triton-dejavu] First execution including JIT compilation took 1.172961711883545s.
+[triton-dejavu] First execution including JIT compilation took 5.012480020523071s.
+[triton-dejavu] First execution including JIT compilation took 2.740521192550659s.
+[triton-dejavu] First execution including JIT compilation took 1.2781014442443848s.
+[triton-dejavu] First execution including JIT compilation took 9.610878944396973s.
+bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 299520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 397824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 397824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 499200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 597504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 597504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 698880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 797184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 797184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__
+    self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
+RuntimeError: Triton Error [CUDA]: out of memory
+
+[triton-dejavu] First execution including JIT compilation took 7.976198434829712s.
+[triton-dejavu] First execution including JIT compilation took 3.3016257286071777s.
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 599040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 798720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 995328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 995328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 998400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1195008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1195008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1195008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1397760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1397760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1594368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1594368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1594368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1594368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.31671142578125s. +[triton-dejavu] First execution including JIT compilation took 0.22868680953979492s. +[triton-dejavu] First execution including JIT compilation took 0.18102025985717773s. +[triton-dejavu] First execution including JIT compilation took 0.30118656158447266s. +[triton-dejavu] First execution including JIT compilation took 0.23821210861206055s. +[triton-dejavu] First execution including JIT compilation took 0.20787501335144043s. 
+[triton-dejavu] First execution including JIT compilation took 0.32696962356567383s.
+[triton-dejavu] First execution including JIT compilation took 0.2887406349182129s.
+[triton-dejavu] First execution including JIT compilation took 0.21834921836853027s.
+[triton-dejavu] First execution including JIT compilation took 0.3684656620025635s.
+[triton-dejavu] First execution including JIT compilation took 0.2897188663482666s.
+[triton-dejavu] First execution including JIT compilation took 0.21413230895996094s.
+[triton-dejavu] First execution including JIT compilation took 0.4073657989501953s.
+[triton-dejavu] First execution including JIT compilation took 0.29150938987731934s.
+[triton-dejavu] First execution including JIT compilation took 0.24649262428283691s.
+[triton-dejavu] First execution including JIT compilation took 0.3873109817504883s.
+[triton-dejavu] First execution including JIT compilation took 0.3143148422241211s.
+[triton-dejavu] First execution including JIT compilation took 0.25432682037353516s.
+[triton-dejavu] First execution including JIT compilation took 0.4593379497528076s.
+[triton-dejavu] First execution including JIT compilation took 0.3326547145843506s.
+[triton-dejavu] First execution including JIT compilation took 0.27237915992736816s.
+[triton-dejavu] First execution including JIT compilation took 0.37383532524108887s.
+[triton-dejavu] First execution including JIT compilation took 0.2576146125793457s.
+[triton-dejavu] First execution including JIT compilation took 0.2171943187713623s.
+[triton-dejavu] First execution including JIT compilation took 0.3749668598175049s.
+[triton-dejavu] First execution including JIT compilation took 0.3074686527252197s.
+[triton-dejavu] First execution including JIT compilation took 0.2472078800201416s.
+[triton-dejavu] First execution including JIT compilation took 0.4607219696044922s.
+[triton-dejavu] First execution including JIT compilation took 0.2899644374847412s.
+[triton-dejavu] First execution including JIT compilation took 0.29875612258911133s.
+[triton-dejavu] First execution including JIT compilation took 0.59027099609375s.
+[triton-dejavu] First execution including JIT compilation took 0.333834171295166s.
+[triton-dejavu] First execution including JIT compilation took 0.26287293434143066s.
+[triton-dejavu] First execution including JIT compilation took 0.6803445816040039s.
+[triton-dejavu] First execution including JIT compilation took 0.3831348419189453s.
+[triton-dejavu] First execution including JIT compilation took 0.30806612968444824s.
+[triton-dejavu] First execution including JIT compilation took 0.5800254344940186s.
+[triton-dejavu] First execution including JIT compilation took 0.42940402030944824s.
+[triton-dejavu] First execution including JIT compilation took 0.3212895393371582s.
+[triton-dejavu] First execution including JIT compilation took 0.6800441741943359s.
+[triton-dejavu] First execution including JIT compilation took 0.4293978214263916s.
+[triton-dejavu] First execution including JIT compilation took 0.31106042861938477s.
+[triton-dejavu] First execution including JIT compilation took 0.5731973648071289s.
+[triton-dejavu] First execution including JIT compilation took 0.3158423900604248s.
+[triton-dejavu] First execution including JIT compilation took 0.21982431411743164s.
+[triton-dejavu] First execution including JIT compilation took 0.6607396602630615s.
+[triton-dejavu] First execution including JIT compilation took 0.33777403831481934s.
+[triton-dejavu] First execution including JIT compilation took 0.26319289207458496s.
+[triton-dejavu] First execution including JIT compilation took 1.0609164237976074s.
+[triton-dejavu] First execution including JIT compilation took 0.42955660820007324s.
+[triton-dejavu] First execution including JIT compilation took 0.29787397384643555s.
+[triton-dejavu] First execution including JIT compilation took 1.1868953704833984s.
+[triton-dejavu] First execution including JIT compilation took 0.44803476333618164s.
+[triton-dejavu] First execution including JIT compilation took 0.31003737449645996s.
+[triton-dejavu] First execution including JIT compilation took 1.2543339729309082s.
+[triton-dejavu] First execution including JIT compilation took 0.5254006385803223s.
+[triton-dejavu] First execution including JIT compilation took 0.32144618034362793s.
+[triton-dejavu] First execution including JIT compilation took 1.3773908615112305s.
+[triton-dejavu] First execution including JIT compilation took 0.5259160995483398s.
+[triton-dejavu] First execution including JIT compilation took 0.34156179428100586s.
+bench_cudagraph failed with out of resource: shared memory, Required: 249088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 283904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 283904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 1.1119396686553955s.
+[triton-dejavu] First execution including JIT compilation took 0.4642174243927002s.
+[triton-dejavu] First execution including JIT compilation took 0.2669200897216797s.
+[triton-dejavu] First execution including JIT compilation took 1.3371021747589111s.
+[triton-dejavu] First execution including JIT compilation took 0.49892330169677734s.
+[triton-dejavu] First execution including JIT compilation took 0.318439245223999s.
+[triton-dejavu] First execution including JIT compilation took 4.929638385772705s.
+[triton-dejavu] First execution including JIT compilation took 0.9740848541259766s.
+[triton-dejavu] First execution including JIT compilation took 0.37487220764160156s.
+[triton-dejavu] First execution including JIT compilation took 5.154336214065552s.
+bench_cudagraph failed with out of resource: shared memory, Required: 283136, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 283136, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 355840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 425472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 425472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 498176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 567808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 567808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 567808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 5.1173014640808105s. +[triton-dejavu] First execution including JIT compilation took 1.0967254638671875s. +[triton-dejavu] First execution including JIT compilation took 0.48938989639282227s. +[triton-dejavu] First execution including JIT compilation took 3.199140787124634s. +bench_cudagraph failed with out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 423936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 423936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 423936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 423936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 427008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 427008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 566272, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 566272, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 566272, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 566272, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 569344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 569344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 708608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+bench_cudagraph failed with out of resource: shared memory, Required: 708608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 711680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 850944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 850944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 996352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1135616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1135616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 0.5043489933013916s.
+[triton-dejavu] First execution including JIT compilation took 0.3277888298034668s.
+[triton-dejavu] First execution including JIT compilation took 0.24386119842529297s.
+[triton-dejavu] First execution including JIT compilation took 0.5100774765014648s.
+[triton-dejavu] First execution including JIT compilation took 0.34099698066711426s.
+[triton-dejavu] First execution including JIT compilation took 0.2779710292816162s.
+[triton-dejavu] First execution including JIT compilation took 0.5361578464508057s.
+[triton-dejavu] First execution including JIT compilation took 0.3682215213775635s.
+[triton-dejavu] First execution including JIT compilation took 0.300004243850708s.
+[triton-dejavu] First execution including JIT compilation took 0.5743675231933594s.
+[triton-dejavu] First execution including JIT compilation took 0.3822929859161377s.
+[triton-dejavu] First execution including JIT compilation took 0.3132593631744385s.
+[triton-dejavu] First execution including JIT compilation took 0.600147008895874s.
+[triton-dejavu] First execution including JIT compilation took 0.406186580657959s.
+[triton-dejavu] First execution including JIT compilation took 0.328277587890625s.
+[triton-dejavu] First execution including JIT compilation took 0.6361839771270752s.
+[triton-dejavu] First execution including JIT compilation took 0.4613196849822998s.
+[triton-dejavu] First execution including JIT compilation took 0.34471893310546875s.
+[triton-dejavu] First execution including JIT compilation took 0.7103567123413086s.
+[triton-dejavu] First execution including JIT compilation took 0.46198368072509766s.
+[triton-dejavu] First execution including JIT compilation took 0.37233877182006836s.
+[triton-dejavu] First execution including JIT compilation took 0.5790450572967529s.
+[triton-dejavu] First execution including JIT compilation took 0.33751773834228516s.
+[triton-dejavu] First execution including JIT compilation took 0.26027369499206543s.
+[triton-dejavu] First execution including JIT compilation took 0.6138906478881836s.
+[triton-dejavu] First execution including JIT compilation took 0.4593079090118408s.
+[triton-dejavu] First execution including JIT compilation took 0.29265832901000977s.
+[triton-dejavu] First execution including JIT compilation took 0.6543664932250977s.
+[triton-dejavu] First execution including JIT compilation took 0.40736937522888184s.
+[triton-dejavu] First execution including JIT compilation took 0.2809460163116455s.
+[triton-dejavu] First execution including JIT compilation took 0.7575297355651855s.
+[triton-dejavu] First execution including JIT compilation took 0.33360981941223145s.
+[triton-dejavu] First execution including JIT compilation took 0.31966447830200195s.
+[triton-dejavu] First execution including JIT compilation took 0.8111255168914795s.
+[triton-dejavu] First execution including JIT compilation took 0.46427249908447266s.
+[triton-dejavu] First execution including JIT compilation took 0.34816551208496094s.
+[triton-dejavu] First execution including JIT compilation took 0.9121909141540527s.
+[triton-dejavu] First execution including JIT compilation took 0.49946069717407227s.
+[triton-dejavu] First execution including JIT compilation took 0.3831624984741211s.
+[triton-dejavu] First execution including JIT compilation took 1.057861328125s.
+[triton-dejavu] First execution including JIT compilation took 0.5552551746368408s.
+[triton-dejavu] First execution including JIT compilation took 0.3699655532836914s.
+[triton-dejavu] First execution including JIT compilation took 0.8594727516174316s.
+[triton-dejavu] First execution including JIT compilation took 0.42781662940979004s.
+[triton-dejavu] First execution including JIT compilation took 0.31317567825317383s.
+[triton-dejavu] First execution including JIT compilation took 0.9985849857330322s.
+[triton-dejavu] First execution including JIT compilation took 0.5023458003997803s.
+[triton-dejavu] First execution including JIT compilation took 0.37673401832580566s.
+[triton-dejavu] First execution including JIT compilation took 1.501765251159668s.
+[triton-dejavu] First execution including JIT compilation took 0.5676426887512207s.
+[triton-dejavu] First execution including JIT compilation took 0.3674488067626953s.
+[triton-dejavu] First execution including JIT compilation took 1.6308376789093018s.
+[triton-dejavu] First execution including JIT compilation took 0.6221895217895508s.
+[triton-dejavu] First execution including JIT compilation took 0.42139220237731934s.
+[triton-dejavu] First execution including JIT compilation took 1.6980812549591064s.
+[triton-dejavu] First execution including JIT compilation took 0.6677892208099365s.
+[triton-dejavu] First execution including JIT compilation took 0.4159400463104248s.
+[triton-dejavu] First execution including JIT compilation took 1.8085997104644775s.
+[triton-dejavu] First execution including JIT compilation took 0.7202484607696533s.
+[triton-dejavu] First execution including JIT compilation took 0.4506070613861084s.
+bench_cudagraph failed with out of resource: shared memory, Required: 263424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 300288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 300288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 1.7220134735107422s.
+[triton-dejavu] First execution including JIT compilation took 0.6867120265960693s.
+[triton-dejavu] First execution including JIT compilation took 0.4079298973083496s.
+[triton-dejavu] First execution including JIT compilation took 1.9503588676452637s.
+[triton-dejavu] First execution including JIT compilation took 0.7429883480072021s.
+[triton-dejavu] First execution including JIT compilation took 0.46311044692993164s.
+[triton-dejavu] First execution including JIT compilation took 5.910313367843628s.
+[triton-dejavu] First execution including JIT compilation took 1.2488391399383545s.
+[triton-dejavu] First execution including JIT compilation took 0.5487070083618164s.
+[triton-dejavu] First execution including JIT compilation took 5.962830066680908s.
+bench_cudagraph failed with out of resource: shared memory, Required: 299520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 299520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 374784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 374784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 376320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 450048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 450048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 526848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 600576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 600576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 5.886913776397705s.
+[triton-dejavu] First execution including JIT compilation took 1.1668055057525635s.
+[triton-dejavu] First execution including JIT compilation took 0.5872712135314941s.
+[triton-dejavu] First execution including JIT compilation took 4.414681434631348s.
+bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 451584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 599040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 599040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 602112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 749568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 749568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 752640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 900096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 900096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1201152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1201152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1201152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1201152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.7140195369720459s. +[triton-dejavu] First execution including JIT compilation took 0.36057424545288086s. +[triton-dejavu] First execution including JIT compilation took 0.25351953506469727s. +[triton-dejavu] First execution including JIT compilation took 0.6811349391937256s. +[triton-dejavu] First execution including JIT compilation took 0.39322566986083984s. +[triton-dejavu] First execution including JIT compilation took 0.26434326171875s. 
+[triton-dejavu] First execution including JIT compilation took 0.7010984420776367s. +[triton-dejavu] First execution including JIT compilation took 0.3896751403808594s. +[triton-dejavu] First execution including JIT compilation took 0.28528761863708496s. +[triton-dejavu] First execution including JIT compilation took 0.7317478656768799s. +[triton-dejavu] First execution including JIT compilation took 0.4223208427429199s. +[triton-dejavu] First execution including JIT compilation took 0.28917455673217773s. +[triton-dejavu] First execution including JIT compilation took 0.744981050491333s. +[triton-dejavu] First execution including JIT compilation took 0.3946702480316162s. +[triton-dejavu] First execution including JIT compilation took 0.29891490936279297s. +[triton-dejavu] First execution including JIT compilation took 0.765406608581543s. +[triton-dejavu] First execution including JIT compilation took 0.44653844833374023s. +[triton-dejavu] First execution including JIT compilation took 0.33998966217041016s. +[triton-dejavu] First execution including JIT compilation took 0.8716061115264893s. +[triton-dejavu] First execution including JIT compilation took 0.4519209861755371s. +[triton-dejavu] First execution including JIT compilation took 0.348386287689209s. +[triton-dejavu] First execution including JIT compilation took 1.0358567237854004s. +[triton-dejavu] First execution including JIT compilation took 0.4894859790802002s. +[triton-dejavu] First execution including JIT compilation took 0.3279075622558594s. +[triton-dejavu] First execution including JIT compilation took 1.148808479309082s. +[triton-dejavu] First execution including JIT compilation took 0.5393466949462891s. +[triton-dejavu] First execution including JIT compilation took 0.3747735023498535s. +[triton-dejavu] First execution including JIT compilation took 1.237614631652832s. +[triton-dejavu] First execution including JIT compilation took 0.5807638168334961s. +[triton-dejavu] First execution including JIT compilation took 0.3793628215789795s. +[triton-dejavu] First execution including JIT compilation took 1.323664903640747s. +[triton-dejavu] First execution including JIT compilation took 0.6247925758361816s. +[triton-dejavu] First execution including JIT compilation took 0.39437222480773926s. +[triton-dejavu] First execution including JIT compilation took 1.3928866386413574s. +[triton-dejavu] First execution including JIT compilation took 0.6385958194732666s. +[triton-dejavu] First execution including JIT compilation took 0.4033050537109375s. +[triton-dejavu] First execution including JIT compilation took 1.440335988998413s. +[triton-dejavu] First execution including JIT compilation took 0.5338249206542969s. +[triton-dejavu] First execution including JIT compilation took 0.33176136016845703s. +[triton-dejavu] First execution including JIT compilation took 1.2540497779846191s. +[triton-dejavu] First execution including JIT compilation took 0.5932145118713379s. +[triton-dejavu] First execution including JIT compilation took 0.35477566719055176s. +[triton-dejavu] First execution including JIT compilation took 1.5448269844055176s. +[triton-dejavu] First execution including JIT compilation took 0.6225264072418213s. +[triton-dejavu] First execution including JIT compilation took 0.36580657958984375s. +[triton-dejavu] First execution including JIT compilation took 1.6768112182617188s. +[triton-dejavu] First execution including JIT compilation took 0.6222131252288818s. 
+[triton-dejavu] First execution including JIT compilation took 0.42403435707092285s. +[triton-dejavu] First execution including JIT compilation took 1.895324945449829s. +[triton-dejavu] First execution including JIT compilation took 0.6491637229919434s. +[triton-dejavu] First execution including JIT compilation took 0.49414658546447754s. +[triton-dejavu] First execution including JIT compilation took 2.1658642292022705s. +[triton-dejavu] First execution including JIT compilation took 0.9408359527587891s. +[triton-dejavu] First execution including JIT compilation took 0.4878816604614258s. +[triton-dejavu] First execution including JIT compilation took 2.3840792179107666s. +[triton-dejavu] First execution including JIT compilation took 0.8270976543426514s. +[triton-dejavu] First execution including JIT compilation took 0.41714978218078613s. +[triton-dejavu] First execution including JIT compilation took 2.040860891342163s. +bench_cudagraph failed with out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 292096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 292096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 333056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 333056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 333056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 333056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 2.6213810443878174s. +[triton-dejavu] First execution including JIT compilation took 0.9269452095031738s. +[triton-dejavu] First execution including JIT compilation took 0.47311830520629883s. +[triton-dejavu] First execution including JIT compilation took 2.7501637935638428s. +[triton-dejavu] First execution including JIT compilation took 0.7629375457763672s. +[triton-dejavu] First execution including JIT compilation took 0.4511408805847168s. 
+[triton-dejavu] First execution including JIT compilation took 7.134660243988037s. +bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 250368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 250368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 332288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 332288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 332288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 332288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 415744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 415744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 415744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 415744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 417280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 417280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 499200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 499200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 499200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 499200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 584192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 584192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 666112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 666112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 666112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 666112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 5.504925966262817s. +[triton-dejavu] First execution including JIT compilation took 1.498021125793457s. +[triton-dejavu] First execution including JIT compilation took 0.7205624580383301s. +[triton-dejavu] First execution including JIT compilation took 6.149832725524902s. +bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 500736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 500736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 664576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 664576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
[identical OutOfResources tracebacks, differing only in the required shared-memory size, repeat for configurations needing 667648, 831488, 834560, 998400, 1168384, and 1332224 bytes against the 232448-byte hardware limit]
+
+[triton-dejavu] First execution including JIT compilation took 1.4407587051391602s.
+[triton-dejavu] First execution including JIT compilation took 0.7037980556488037s.
+[triton-dejavu] First execution including JIT compilation took 0.4034006595611572s.
[further per-configuration JIT compilation timings between roughly 0.38 s and 3.7 s]
+
[the same OutOfResources traceback then repeats for required shared-memory sizes 248832, 249600, 298752, 349440, and 398592 bytes, followed by JIT compilation timings between roughly 0.71 s and 10.9 s and further shared-memory failures at 297984, 299520, 397824, 399360, 497664, 499200, 597504, 698880, and 797184 bytes]
+ +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ + self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) +RuntimeError: Triton Error [CUDA]: out of memory + +[triton-dejavu] First execution including JIT compilation took 3.2677865028381348s. +[triton-dejavu] First execution including JIT compilation took 1.392303705215454s. +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ + self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 599040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 798720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 995328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 995328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 998400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1195008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1195008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1397760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1594368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1594368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 3.4989428520202637s.
+[triton-dejavu] First execution including JIT compilation took 1.8932569026947021s.
+[triton-dejavu] First execution including JIT compilation took 0.732560396194458s.
+[triton-dejavu] First execution including JIT compilation took 3.6859214305877686s.
+[triton-dejavu] First execution including JIT compilation took 1.8280115127563477s.
+[triton-dejavu] First execution including JIT compilation took 0.7882623672485352s.
+[triton-dejavu] First execution including JIT compilation took 3.735793113708496s.
+[triton-dejavu] First execution including JIT compilation took 2.119565486907959s.
+[triton-dejavu] First execution including JIT compilation took 0.8172221183776855s.
+[triton-dejavu] First execution including JIT compilation took 4.001902341842651s.
+[triton-dejavu] First execution including JIT compilation took 1.949631690979004s.
+[triton-dejavu] First execution including JIT compilation took 0.8840606212615967s.
+[triton-dejavu] First execution including JIT compilation took 4.194988489151001s.
+[triton-dejavu] First execution including JIT compilation took 2.0192768573760986s.
+[triton-dejavu] First execution including JIT compilation took 0.8446464538574219s.
+[triton-dejavu] First execution including JIT compilation took 3.981654644012451s.
+[triton-dejavu] First execution including JIT compilation took 2.0983834266662598s.
+[triton-dejavu] First execution including JIT compilation took 0.9342923164367676s.
+[triton-dejavu] First execution including JIT compilation took 3.9925155639648438s.
+[triton-dejavu] First execution including JIT compilation took 2.2087132930755615s.
+[triton-dejavu] First execution including JIT compilation took 0.949195146560669s.
+[triton-dejavu] First execution including JIT compilation took 4.75992751121521s.
+[triton-dejavu] First execution including JIT compilation took 2.1755006313323975s.
+[triton-dejavu] First execution including JIT compilation took 1.1709823608398438s.
+[triton-dejavu] First execution including JIT compilation took 4.726720809936523s.
+[triton-dejavu] First execution including JIT compilation took 2.514338970184326s.
+[triton-dejavu] First execution including JIT compilation took 0.9337425231933594s.
+[triton-dejavu] First execution including JIT compilation took 5.192431449890137s.
+[triton-dejavu] First execution including JIT compilation took 2.444566488265991s.
+[triton-dejavu] First execution including JIT compilation took 0.9995050430297852s.
+[triton-dejavu] First execution including JIT compilation took 5.0464768409729s.
+[triton-dejavu] First execution including JIT compilation took 2.496889352798462s.
+[triton-dejavu] First execution including JIT compilation took 1.0279152393341064s.
+[triton-dejavu] First execution including JIT compilation took 5.1387224197387695s.
+[triton-dejavu] First execution including JIT compilation took 2.8983592987060547s.
+[triton-dejavu] First execution including JIT compilation took 1.048851490020752s.
+[triton-dejavu] First execution including JIT compilation took 5.0407774448394775s.
+[triton-dejavu] First execution including JIT compilation took 2.5578625202178955s.
+[triton-dejavu] First execution including JIT compilation took 1.0612678527832031s.
+bench_cudagraph failed with out of resource: shared memory, Required: 248448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 264832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 264832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 5.719822406768799s.
+[triton-dejavu] First execution including JIT compilation took 3.07711124420166s.
+[triton-dejavu] First execution including JIT compilation took 1.6147170066833496s.
+[triton-dejavu] First execution including JIT compilation took 6.198007345199585s.
+[triton-dejavu] First execution including JIT compilation took 3.1891562938690186s.
+[triton-dejavu] First execution including JIT compilation took 1.3675951957702637s.
+[triton-dejavu] First execution including JIT compilation took 6.984339952468872s.
+[triton-dejavu] First execution including JIT compilation took 3.410410165786743s.
+[triton-dejavu] First execution including JIT compilation took 1.380906581878662s.
+[triton-dejavu] First execution including JIT compilation took 7.323652982711792s.
+bench_cudagraph failed with out of resource: shared memory, Required: 264448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 264448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 331520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 397056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 397056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 464128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 529664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 529664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__
+    self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
+RuntimeError: Triton Error [CUDA]: out of memory
+
+[triton-dejavu] First execution including JIT compilation took 6.1288557052612305s.
+[triton-dejavu] First execution including JIT compilation took 2.2384567260742188s.
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+bench_cudagraph failed with out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 397824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 528896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 528896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 663040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 663040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 794112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 794112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 794112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 794112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 928256, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 928256, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1059328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1059328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1059328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1059328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1057792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1057792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1057792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1057792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1060864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1060864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1323008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1323008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1323008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1323008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1326080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1326080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1588224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1588224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1588224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1588224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1856512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1856512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 2118656, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2118656, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 2118656, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
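For reference, the 232448-byte hardware limit above corresponds to roughly 227 KB of shared memory per thread block, and a pipelined tiled kernel needs on the order of num_stages * (BLOCK_M*BLOCK_K + BLOCK_K*BLOCK_N) * bytes_per_element of shared memory, which is why the larger tile/stage combinations cannot be launched. A minimal, hypothetical Python sketch of pruning such candidates before benchmarking (the estimator and all names below are illustrative assumptions, not code from this repository):

    # Hypothetical helper: drop autotune candidates whose estimated shared-memory
    # footprint exceeds the per-block hardware limit seen in the log (232448 bytes).
    # The cost model is a rough approximation for a pipelined tiled kernel, not the
    # exact accounting Triton performs.

    SHARED_MEM_LIMIT = 232448  # bytes, from "Hardware limit" in the log above


    def estimated_shared_mem(block_m: int, block_n: int, block_k: int,
                             num_stages: int, bytes_per_elem: int = 2) -> int:
        """Approximate shared memory for BLOCK_M x BLOCK_K and BLOCK_K x BLOCK_N
        tiles kept resident per pipeline stage (bf16/fp16 -> 2 bytes/element)."""
        per_stage = (block_m * block_k + block_k * block_n) * bytes_per_elem
        return num_stages * per_stage


    def keep_config(cfg: dict) -> bool:
        return estimated_shared_mem(cfg["BLOCK_SIZE_M"], cfg["BLOCK_SIZE_N"],
                                    cfg["BLOCK_SIZE_K"], cfg["num_stages"]) <= SHARED_MEM_LIMIT


    candidates = [
        {"BLOCK_SIZE_M": m, "BLOCK_SIZE_N": n, "BLOCK_SIZE_K": k, "num_stages": s}
        for m in (64, 128, 256)
        for n in (64, 128, 256)
        for k in (64, 128, 256)
        for s in (1, 2, 4)
    ]
    survivors = [c for c in candidates if keep_config(c)]
    print(f"kept {len(survivors)} of {len(candidates)} candidate configs")

The tuner in the log evidently continues past these failures and still finds a best config; pre-filtering along these lines would only avoid spending benchmark time on configurations that cannot fit.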
+[triton-dejavu] added BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 64, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None for _chunk_state_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default and key ('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')
+[2025-07-23 14:17:39] Triton autotuning for function _chunk_state_fwd_kernel finished after 9348.03s; best config selected: BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 64, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None with benchmark time 0.003924777265638113; evaluated 2625 configurations;
+[triton-dejavu] ('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32') not in cache, starting to tune...
+[triton-dejavu] [2025-07-23 14:17:39] Started benchmarking of 168 configurations... (use_bo: False, run: 0)
+[... "[triton-dejavu] First execution including JIT compilation took ...s." lines repeat for each benchmarked configuration, with first-execution times ranging from roughly 0.003 s to 0.23 s ...]
+[triton-dejavu] added BLOCK_SIZE: 512, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None for _state_passing_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default and key ('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32') +[2025-07-23 14:22:15] Triton autotuning for function _state_passing_fwd_kernel finished after 275.26s; best config selected: BLOCK_SIZE: 512, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None with benchmark time 0.0030820679385215044; evaluated 168 configurations; +[triton-dejavu] ('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32') not in cache, starting to tune... +[triton-dejavu] [2025-07-23 14:22:15] Started benchmarking of 2625 configurations... (use_bo: False, run: 0) +[triton-dejavu] First execution including JIT compilation took 0.1971287727355957s. +[triton-dejavu] First execution including JIT compilation took 0.18145108222961426s. +[triton-dejavu] First execution including JIT compilation took 0.18181228637695312s. +[triton-dejavu] First execution including JIT compilation took 0.20481252670288086s. +[triton-dejavu] First execution including JIT compilation took 0.19466614723205566s. +[triton-dejavu] First execution including JIT compilation took 0.17426085472106934s. +[triton-dejavu] First execution including JIT compilation took 0.21188688278198242s. +[triton-dejavu] First execution including JIT compilation took 0.20443081855773926s. +[triton-dejavu] First execution including JIT compilation took 0.18296051025390625s. +[triton-dejavu] First execution including JIT compilation took 0.21415448188781738s. +[triton-dejavu] First execution including JIT compilation took 0.20465874671936035s. +[triton-dejavu] First execution including JIT compilation took 0.1801447868347168s. +[triton-dejavu] First execution including JIT compilation took 0.21986842155456543s. +[triton-dejavu] First execution including JIT compilation took 0.2162468433380127s. +[triton-dejavu] First execution including JIT compilation took 0.17408537864685059s. +[triton-dejavu] First execution including JIT compilation took 0.23129940032958984s. +[triton-dejavu] First execution including JIT compilation took 0.22765421867370605s. +[triton-dejavu] First execution including JIT compilation took 0.171464204788208s. +[triton-dejavu] First execution including JIT compilation took 0.24284863471984863s. +[triton-dejavu] First execution including JIT compilation took 0.2351231575012207s. +[triton-dejavu] First execution including JIT compilation took 0.17095470428466797s. +[triton-dejavu] First execution including JIT compilation took 0.19266152381896973s. +[triton-dejavu] First execution including JIT compilation took 0.19104242324829102s. +[triton-dejavu] First execution including JIT compilation took 0.1844947338104248s. +[triton-dejavu] First execution including JIT compilation took 0.20961451530456543s. +[triton-dejavu] First execution including JIT compilation took 0.20723509788513184s. 
+[triton-dejavu] First execution including JIT compilation took 0.20163917541503906s. +[triton-dejavu] First execution including JIT compilation took 0.22046804428100586s. +[triton-dejavu] First execution including JIT compilation took 0.21246051788330078s. +[triton-dejavu] First execution including JIT compilation took 0.21422886848449707s. +[triton-dejavu] First execution including JIT compilation took 0.2310943603515625s. +[triton-dejavu] First execution including JIT compilation took 0.22539591789245605s. +[triton-dejavu] First execution including JIT compilation took 0.21231532096862793s. +[triton-dejavu] First execution including JIT compilation took 0.23757481575012207s. +[triton-dejavu] First execution including JIT compilation took 0.2188396453857422s. +[triton-dejavu] First execution including JIT compilation took 0.22507905960083008s. +[triton-dejavu] First execution including JIT compilation took 0.25089573860168457s. +[triton-dejavu] First execution including JIT compilation took 0.2278902530670166s. +[triton-dejavu] First execution including JIT compilation took 0.22785520553588867s. +[triton-dejavu] First execution including JIT compilation took 0.2572140693664551s. +[triton-dejavu] First execution including JIT compilation took 0.24475598335266113s. +[triton-dejavu] First execution including JIT compilation took 0.275850772857666s. +[triton-dejavu] First execution including JIT compilation took 0.21295976638793945s. +[triton-dejavu] First execution including JIT compilation took 0.19115710258483887s. +[triton-dejavu] First execution including JIT compilation took 0.18381786346435547s. +[triton-dejavu] First execution including JIT compilation took 0.21824884414672852s. +[triton-dejavu] First execution including JIT compilation took 0.2043604850769043s. +[triton-dejavu] First execution including JIT compilation took 0.2003331184387207s. +[triton-dejavu] First execution including JIT compilation took 0.2335672378540039s. +[triton-dejavu] First execution including JIT compilation took 0.21690750122070312s. +[triton-dejavu] First execution including JIT compilation took 0.18306660652160645s. +[triton-dejavu] First execution including JIT compilation took 0.2298128604888916s. +[triton-dejavu] First execution including JIT compilation took 0.19614720344543457s. +[triton-dejavu] First execution including JIT compilation took 0.2060997486114502s. +[triton-dejavu] First execution including JIT compilation took 0.21973228454589844s. +[triton-dejavu] First execution including JIT compilation took 0.18016934394836426s. +[triton-dejavu] First execution including JIT compilation took 0.20123672485351562s. +[triton-dejavu] First execution including JIT compilation took 0.24709606170654297s. +[triton-dejavu] First execution including JIT compilation took 0.20981693267822266s. +[triton-dejavu] First execution including JIT compilation took 0.2014932632446289s. +[triton-dejavu] First execution including JIT compilation took 0.25247669219970703s. +[triton-dejavu] First execution including JIT compilation took 0.20742201805114746s. +[triton-dejavu] First execution including JIT compilation took 0.23698949813842773s. +[triton-dejavu] First execution including JIT compilation took 0.20906853675842285s. +[triton-dejavu] First execution including JIT compilation took 0.19327425956726074s. +[triton-dejavu] First execution including JIT compilation took 0.1958320140838623s. +[triton-dejavu] First execution including JIT compilation took 0.22327661514282227s. 
+[triton-dejavu] First execution including JIT compilation took 0.2055678367614746s. +[triton-dejavu] First execution including JIT compilation took 0.22722506523132324s. +[triton-dejavu] First execution including JIT compilation took 0.29752469062805176s. +[triton-dejavu] First execution including JIT compilation took 0.24663901329040527s. +[triton-dejavu] First execution including JIT compilation took 0.22910308837890625s. +[triton-dejavu] First execution including JIT compilation took 0.30620813369750977s. +[triton-dejavu] First execution including JIT compilation took 0.2616004943847656s. +[triton-dejavu] First execution including JIT compilation took 0.24086618423461914s. +[triton-dejavu] First execution including JIT compilation took 0.31242823600769043s. +[triton-dejavu] First execution including JIT compilation took 0.26506876945495605s. +[triton-dejavu] First execution including JIT compilation took 0.24354910850524902s. +[triton-dejavu] First execution including JIT compilation took 0.33380126953125s. +[triton-dejavu] First execution including JIT compilation took 0.27773475646972656s. +[triton-dejavu] First execution including JIT compilation took 0.25086140632629395s. +[triton-dejavu] First execution including JIT compilation took 0.3661017417907715s. +[triton-dejavu] First execution including JIT compilation took 0.29815053939819336s. +[triton-dejavu] First execution including JIT compilation took 0.26163578033447266s. +[triton-dejavu] First execution including JIT compilation took 0.32303476333618164s. +[triton-dejavu] First execution including JIT compilation took 0.26395726203918457s. +[triton-dejavu] First execution including JIT compilation took 0.24637055397033691s. +[triton-dejavu] First execution including JIT compilation took 0.35486674308776855s. +[triton-dejavu] First execution including JIT compilation took 0.2868657112121582s. +[triton-dejavu] First execution including JIT compilation took 0.25715160369873047s. +[triton-dejavu] First execution including JIT compilation took 0.38285207748413086s. +[triton-dejavu] First execution including JIT compilation took 0.31813502311706543s. +[triton-dejavu] First execution including JIT compilation took 0.2733750343322754s. +[triton-dejavu] First execution including JIT compilation took 0.4395263195037842s. +[triton-dejavu] First execution including JIT compilation took 0.3358616828918457s. +[triton-dejavu] First execution including JIT compilation took 0.29150915145874023s. +[triton-dejavu] First execution including JIT compilation took 0.5277695655822754s. +[triton-dejavu] First execution including JIT compilation took 0.3672621250152588s. +[triton-dejavu] First execution including JIT compilation took 0.3096439838409424s. +[triton-dejavu] First execution including JIT compilation took 0.5141489505767822s. +[triton-dejavu] First execution including JIT compilation took 0.37669992446899414s. +[triton-dejavu] First execution including JIT compilation took 0.311464786529541s. +[triton-dejavu] First execution including JIT compilation took 0.5582582950592041s. +[triton-dejavu] First execution including JIT compilation took 0.4001944065093994s. +[triton-dejavu] First execution including JIT compilation took 0.3267343044281006s. +[triton-dejavu] First execution including JIT compilation took 0.19158482551574707s. +[triton-dejavu] First execution including JIT compilation took 0.18754005432128906s. +[triton-dejavu] First execution including JIT compilation took 0.17347002029418945s. 
+[triton-dejavu] First execution including JIT compilation took 0.20695137977600098s. +[triton-dejavu] First execution including JIT compilation took 0.1933000087738037s. +[triton-dejavu] First execution including JIT compilation took 0.18866968154907227s. +[triton-dejavu] First execution including JIT compilation took 0.21515560150146484s. +[triton-dejavu] First execution including JIT compilation took 0.22445225715637207s. +[triton-dejavu] First execution including JIT compilation took 0.19047832489013672s. +[triton-dejavu] First execution including JIT compilation took 0.21751952171325684s. +[triton-dejavu] First execution including JIT compilation took 0.2095792293548584s. +[triton-dejavu] First execution including JIT compilation took 0.1899867057800293s. +[triton-dejavu] First execution including JIT compilation took 0.22539901733398438s. +[triton-dejavu] First execution including JIT compilation took 0.214493989944458s. +[triton-dejavu] First execution including JIT compilation took 0.19467592239379883s. +[triton-dejavu] First execution including JIT compilation took 0.23510289192199707s. +[triton-dejavu] First execution including JIT compilation took 0.23056340217590332s. +[triton-dejavu] First execution including JIT compilation took 0.20400166511535645s. +[triton-dejavu] First execution including JIT compilation took 0.25162243843078613s. +[triton-dejavu] First execution including JIT compilation took 0.24202203750610352s. +[triton-dejavu] First execution including JIT compilation took 0.2179243564605713s. +[triton-dejavu] First execution including JIT compilation took 0.20107483863830566s. +[triton-dejavu] First execution including JIT compilation took 0.19151616096496582s. +[triton-dejavu] First execution including JIT compilation took 0.1852731704711914s. +[triton-dejavu] First execution including JIT compilation took 0.2255268096923828s. +[triton-dejavu] First execution including JIT compilation took 0.20557308197021484s. +[triton-dejavu] First execution including JIT compilation took 0.19893908500671387s. +[triton-dejavu] First execution including JIT compilation took 0.2369074821472168s. +[triton-dejavu] First execution including JIT compilation took 0.22331023216247559s. +[triton-dejavu] First execution including JIT compilation took 0.2091996669769287s. +[triton-dejavu] First execution including JIT compilation took 0.2456064224243164s. +[triton-dejavu] First execution including JIT compilation took 0.24354219436645508s. +[triton-dejavu] First execution including JIT compilation took 0.2187485694885254s. +[triton-dejavu] First execution including JIT compilation took 0.26705098152160645s. +[triton-dejavu] First execution including JIT compilation took 0.22890710830688477s. +[triton-dejavu] First execution including JIT compilation took 0.228562593460083s. +[triton-dejavu] First execution including JIT compilation took 0.260115385055542s. +[triton-dejavu] First execution including JIT compilation took 0.23436951637268066s. +[triton-dejavu] First execution including JIT compilation took 0.24272942543029785s. +[triton-dejavu] First execution including JIT compilation took 0.2728395462036133s. +[triton-dejavu] First execution including JIT compilation took 0.25573110580444336s. +[triton-dejavu] First execution including JIT compilation took 0.239990234375s. +[triton-dejavu] First execution including JIT compilation took 0.23782968521118164s. +[triton-dejavu] First execution including JIT compilation took 0.20571660995483398s. 
+[triton-dejavu] First execution including JIT compilation took 0.1985173225402832s. +[triton-dejavu] First execution including JIT compilation took 0.23909878730773926s. +[triton-dejavu] First execution including JIT compilation took 0.21619272232055664s. +[triton-dejavu] First execution including JIT compilation took 0.205078125s. +[triton-dejavu] First execution including JIT compilation took 0.25579833984375s. +[triton-dejavu] First execution including JIT compilation took 0.22826004028320312s. +[triton-dejavu] First execution including JIT compilation took 0.21488690376281738s. +[triton-dejavu] First execution including JIT compilation took 0.2719230651855469s. +[triton-dejavu] First execution including JIT compilation took 0.237349271774292s. +[triton-dejavu] First execution including JIT compilation took 0.22726154327392578s. +[triton-dejavu] First execution including JIT compilation took 0.29409146308898926s. +[triton-dejavu] First execution including JIT compilation took 0.2537970542907715s. +[triton-dejavu] First execution including JIT compilation took 0.2349834442138672s. +[triton-dejavu] First execution including JIT compilation took 0.3101375102996826s. +[triton-dejavu] First execution including JIT compilation took 0.25778889656066895s. +[triton-dejavu] First execution including JIT compilation took 0.2488398551940918s. +[triton-dejavu] First execution including JIT compilation took 0.3380768299102783s. +[triton-dejavu] First execution including JIT compilation took 0.24480175971984863s. +[triton-dejavu] First execution including JIT compilation took 0.24767565727233887s. +[triton-dejavu] First execution including JIT compilation took 0.24734854698181152s. +[triton-dejavu] First execution including JIT compilation took 0.22959280014038086s. +[triton-dejavu] First execution including JIT compilation took 0.19723773002624512s. +[triton-dejavu] First execution including JIT compilation took 0.2590954303741455s. +[triton-dejavu] First execution including JIT compilation took 0.23061442375183105s. +[triton-dejavu] First execution including JIT compilation took 0.19670867919921875s. +[triton-dejavu] First execution including JIT compilation took 0.3171346187591553s. +[triton-dejavu] First execution including JIT compilation took 0.232527494430542s. +[triton-dejavu] First execution including JIT compilation took 0.22737908363342285s. +[triton-dejavu] First execution including JIT compilation took 0.3899686336517334s. +[triton-dejavu] First execution including JIT compilation took 0.278536319732666s. +[triton-dejavu] First execution including JIT compilation took 0.24887967109680176s. +[triton-dejavu] First execution including JIT compilation took 0.40813517570495605s. +[triton-dejavu] First execution including JIT compilation took 0.29118990898132324s. +[triton-dejavu] First execution including JIT compilation took 0.25837135314941406s. +[triton-dejavu] First execution including JIT compilation took 0.42389464378356934s. +[triton-dejavu] First execution including JIT compilation took 0.3060779571533203s. +[triton-dejavu] First execution including JIT compilation took 0.26450538635253906s. +[triton-dejavu] First execution including JIT compilation took 0.49116992950439453s. +[triton-dejavu] First execution including JIT compilation took 0.33746862411499023s. +[triton-dejavu] First execution including JIT compilation took 0.28478169441223145s. +[triton-dejavu] First execution including JIT compilation took 0.41333556175231934s. 
+[triton-dejavu] First execution including JIT compilation took 0.29308557510375977s. +[triton-dejavu] First execution including JIT compilation took 0.2590651512145996s. +[triton-dejavu] First execution including JIT compilation took 0.4438004493713379s. +[triton-dejavu] First execution including JIT compilation took 0.38523340225219727s. +[triton-dejavu] First execution including JIT compilation took 0.2776186466217041s. +[triton-dejavu] First execution including JIT compilation took 0.5478754043579102s. +[triton-dejavu] First execution including JIT compilation took 0.3559560775756836s. +[triton-dejavu] First execution including JIT compilation took 0.29976773262023926s. +[triton-dejavu] First execution including JIT compilation took 0.5995876789093018s. +[triton-dejavu] First execution including JIT compilation took 0.38107895851135254s. +[triton-dejavu] First execution including JIT compilation took 0.3243865966796875s. +[triton-dejavu] First execution including JIT compilation took 0.7119507789611816s. +[triton-dejavu] First execution including JIT compilation took 0.4182438850402832s. +[triton-dejavu] First execution including JIT compilation took 0.33512067794799805s. +[triton-dejavu] First execution including JIT compilation took 0.7309103012084961s. +[triton-dejavu] First execution including JIT compilation took 0.4567563533782959s. +[triton-dejavu] First execution including JIT compilation took 0.3427090644836426s. +[triton-dejavu] First execution including JIT compilation took 0.7929611206054688s. +[triton-dejavu] First execution including JIT compilation took 0.7045941352844238s. +[triton-dejavu] First execution including JIT compilation took 0.36629557609558105s. +[triton-dejavu] First execution including JIT compilation took 0.22418737411499023s. +[triton-dejavu] First execution including JIT compilation took 0.1955420970916748s. +[triton-dejavu] First execution including JIT compilation took 0.44211864471435547s. +[triton-dejavu] First execution including JIT compilation took 0.23611164093017578s. +[triton-dejavu] First execution including JIT compilation took 0.36040568351745605s. +[triton-dejavu] First execution including JIT compilation took 0.19045591354370117s. +[triton-dejavu] First execution including JIT compilation took 0.24911093711853027s. +[triton-dejavu] First execution including JIT compilation took 0.2074282169342041s. +[triton-dejavu] First execution including JIT compilation took 0.19919967651367188s. +[triton-dejavu] First execution including JIT compilation took 0.24654054641723633s. +[triton-dejavu] First execution including JIT compilation took 0.21567964553833008s. +[triton-dejavu] First execution including JIT compilation took 0.2518477439880371s. +[triton-dejavu] First execution including JIT compilation took 0.2529888153076172s. +[triton-dejavu] First execution including JIT compilation took 0.22897052764892578s. +[triton-dejavu] First execution including JIT compilation took 0.2083446979522705s. +[triton-dejavu] First execution including JIT compilation took 0.2635183334350586s. +[triton-dejavu] First execution including JIT compilation took 0.2431652545928955s. +[triton-dejavu] First execution including JIT compilation took 0.20887160301208496s. +[triton-dejavu] First execution including JIT compilation took 0.29308581352233887s. +[triton-dejavu] First execution including JIT compilation took 0.25936341285705566s. +[triton-dejavu] First execution including JIT compilation took 0.22800016403198242s. 
+[triton-dejavu] First execution including JIT compilation took 0.24914908409118652s. +[triton-dejavu] First execution including JIT compilation took 0.21156573295593262s. +[triton-dejavu] First execution including JIT compilation took 0.19665884971618652s. +[triton-dejavu] First execution including JIT compilation took 0.2679252624511719s. +[triton-dejavu] First execution including JIT compilation took 0.23529791831970215s. +[triton-dejavu] First execution including JIT compilation took 0.21185588836669922s. +[triton-dejavu] First execution including JIT compilation took 0.27913832664489746s. +[triton-dejavu] First execution including JIT compilation took 0.2429966926574707s. +[triton-dejavu] First execution including JIT compilation took 0.22402739524841309s. +[triton-dejavu] First execution including JIT compilation took 0.28855395317077637s. +[triton-dejavu] First execution including JIT compilation took 0.25331878662109375s. +[triton-dejavu] First execution including JIT compilation took 0.2378528118133545s. +[triton-dejavu] First execution including JIT compilation took 0.30858898162841797s. +[triton-dejavu] First execution including JIT compilation took 0.2576262950897217s. +[triton-dejavu] First execution including JIT compilation took 0.23561835289001465s. +[triton-dejavu] First execution including JIT compilation took 0.3155839443206787s. +[triton-dejavu] First execution including JIT compilation took 0.26711153984069824s. +[triton-dejavu] First execution including JIT compilation took 0.241225004196167s. +[triton-dejavu] First execution including JIT compilation took 0.3228023052215576s. +[triton-dejavu] First execution including JIT compilation took 0.2799217700958252s. +[triton-dejavu] First execution including JIT compilation took 0.2530026435852051s. +[triton-dejavu] First execution including JIT compilation took 0.2789144515991211s. +[triton-dejavu] First execution including JIT compilation took 0.23478245735168457s. +[triton-dejavu] First execution including JIT compilation took 0.20740914344787598s. +[triton-dejavu] First execution including JIT compilation took 0.29624366760253906s. +[triton-dejavu] First execution including JIT compilation took 0.24002695083618164s. +[triton-dejavu] First execution including JIT compilation took 0.21425414085388184s. +[triton-dejavu] First execution including JIT compilation took 0.3217945098876953s. +[triton-dejavu] First execution including JIT compilation took 0.2628786563873291s. +[triton-dejavu] First execution including JIT compilation took 0.22865581512451172s. +[triton-dejavu] First execution including JIT compilation took 0.3735010623931885s. +[triton-dejavu] First execution including JIT compilation took 0.27600812911987305s. +[triton-dejavu] First execution including JIT compilation took 0.24083590507507324s. +[triton-dejavu] First execution including JIT compilation took 0.37182092666625977s. +[triton-dejavu] First execution including JIT compilation took 0.2857823371887207s. +[triton-dejavu] First execution including JIT compilation took 0.2504265308380127s. +[triton-dejavu] First execution including JIT compilation took 0.39750146865844727s. +[triton-dejavu] First execution including JIT compilation took 0.30864596366882324s. +[triton-dejavu] First execution including JIT compilation took 0.2660682201385498s. +[triton-dejavu] First execution including JIT compilation took 0.4444742202758789s. +[triton-dejavu] First execution including JIT compilation took 0.33436083793640137s. 
+[triton-dejavu] First execution including JIT compilation took 0.27864575386047363s. +[triton-dejavu] First execution including JIT compilation took 0.3996555805206299s. +[triton-dejavu] First execution including JIT compilation took 0.29685044288635254s. +[triton-dejavu] First execution including JIT compilation took 0.25818443298339844s. +[triton-dejavu] First execution including JIT compilation took 0.4117124080657959s. +[triton-dejavu] First execution including JIT compilation took 0.3763446807861328s. +[triton-dejavu] First execution including JIT compilation took 0.24976181983947754s. +[triton-dejavu] First execution including JIT compilation took 0.491518497467041s. +[triton-dejavu] First execution including JIT compilation took 0.3266887664794922s. +[triton-dejavu] First execution including JIT compilation took 0.2635648250579834s. +[triton-dejavu] First execution including JIT compilation took 0.5234048366546631s. +[triton-dejavu] First execution including JIT compilation took 0.3361480236053467s. +[triton-dejavu] First execution including JIT compilation took 0.27045202255249023s. +[triton-dejavu] First execution including JIT compilation took 0.5627446174621582s. +[triton-dejavu] First execution including JIT compilation took 0.356564998626709s. +[triton-dejavu] First execution including JIT compilation took 0.288219690322876s. +[triton-dejavu] First execution including JIT compilation took 0.5783913135528564s. +[triton-dejavu] First execution including JIT compilation took 0.3844006061553955s. +[triton-dejavu] First execution including JIT compilation took 0.2928352355957031s. +[triton-dejavu] First execution including JIT compilation took 0.6556406021118164s. +[triton-dejavu] First execution including JIT compilation took 0.41593003273010254s. +[triton-dejavu] First execution including JIT compilation took 0.3149592876434326s. +[triton-dejavu] First execution including JIT compilation took 0.6277866363525391s. +[triton-dejavu] First execution including JIT compilation took 0.36339282989501953s. +[triton-dejavu] First execution including JIT compilation took 0.27582693099975586s. +[triton-dejavu] First execution including JIT compilation took 0.6630880832672119s. +[triton-dejavu] First execution including JIT compilation took 0.3871643543243408s. +[triton-dejavu] First execution including JIT compilation took 0.29570579528808594s. +[triton-dejavu] First execution including JIT compilation took 1.1634502410888672s. +[triton-dejavu] First execution including JIT compilation took 0.5136749744415283s. +[triton-dejavu] First execution including JIT compilation took 0.34021830558776855s. +[triton-dejavu] First execution including JIT compilation took 1.281764268875122s. +[triton-dejavu] First execution including JIT compilation took 0.5489327907562256s. +[triton-dejavu] First execution including JIT compilation took 0.36876344680786133s. +[triton-dejavu] First execution including JIT compilation took 1.3639161586761475s. +[triton-dejavu] First execution including JIT compilation took 0.6005148887634277s. +[triton-dejavu] First execution including JIT compilation took 0.3901093006134033s. +[triton-dejavu] First execution including JIT compilation took 1.4384934902191162s. +[triton-dejavu] First execution including JIT compilation took 0.6145901679992676s. +[triton-dejavu] First execution including JIT compilation took 0.444568395614624s. +bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. 
Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.257260799407959s. +[triton-dejavu] First execution including JIT compilation took 0.21925759315490723s. +[triton-dejavu] First execution including JIT compilation took 0.19404125213623047s. +[triton-dejavu] First execution including JIT compilation took 0.27055978775024414s. +[triton-dejavu] First execution including JIT compilation took 0.23495268821716309s. +[triton-dejavu] First execution including JIT compilation took 0.2018728256225586s. +[triton-dejavu] First execution including JIT compilation took 0.28210949897766113s. +[triton-dejavu] First execution including JIT compilation took 0.24408984184265137s. +[triton-dejavu] First execution including JIT compilation took 0.20690345764160156s. +[triton-dejavu] First execution including JIT compilation took 0.2943837642669678s. +[triton-dejavu] First execution including JIT compilation took 0.2537810802459717s. +[triton-dejavu] First execution including JIT compilation took 0.2101125717163086s. +[triton-dejavu] First execution including JIT compilation took 0.32114124298095703s. +[triton-dejavu] First execution including JIT compilation took 0.2669949531555176s. +[triton-dejavu] First execution including JIT compilation took 0.2184464931488037s. +[triton-dejavu] First execution including JIT compilation took 0.3285989761352539s. +[triton-dejavu] First execution including JIT compilation took 0.2849769592285156s. +[triton-dejavu] First execution including JIT compilation took 0.22274112701416016s. +[triton-dejavu] First execution including JIT compilation took 0.35292792320251465s. +[triton-dejavu] First execution including JIT compilation took 0.30437779426574707s. +[triton-dejavu] First execution including JIT compilation took 0.2387676239013672s. +[triton-dejavu] First execution including JIT compilation took 0.30469393730163574s. +[triton-dejavu] First execution including JIT compilation took 0.2520115375518799s. +[triton-dejavu] First execution including JIT compilation took 0.2158830165863037s. +[triton-dejavu] First execution including JIT compilation took 0.3266003131866455s. 
+[triton-dejavu] First execution including JIT compilation took 0.3258554935455322s. +[triton-dejavu] First execution including JIT compilation took 0.23098182678222656s. +[triton-dejavu] First execution including JIT compilation took 0.32482099533081055s. +[triton-dejavu] First execution including JIT compilation took 0.2595548629760742s. +[triton-dejavu] First execution including JIT compilation took 0.22449946403503418s. +[triton-dejavu] First execution including JIT compilation took 0.31243300437927246s. +[triton-dejavu] First execution including JIT compilation took 0.29460978507995605s. +[triton-dejavu] First execution including JIT compilation took 0.23943471908569336s. +[triton-dejavu] First execution including JIT compilation took 0.33672523498535156s. +[triton-dejavu] First execution including JIT compilation took 0.2958707809448242s. +[triton-dejavu] First execution including JIT compilation took 0.24780011177062988s. +[triton-dejavu] First execution including JIT compilation took 0.3855319023132324s. +[triton-dejavu] First execution including JIT compilation took 0.31192684173583984s. +[triton-dejavu] First execution including JIT compilation took 0.2505671977996826s. +[triton-dejavu] First execution including JIT compilation took 0.505831241607666s. +[triton-dejavu] First execution including JIT compilation took 0.3260018825531006s. +[triton-dejavu] First execution including JIT compilation took 0.26883506774902344s. +[triton-dejavu] First execution including JIT compilation took 0.3750460147857666s. +[triton-dejavu] First execution including JIT compilation took 0.28055334091186523s. +[triton-dejavu] First execution including JIT compilation took 0.22944951057434082s. +[triton-dejavu] First execution including JIT compilation took 0.39789438247680664s. +[triton-dejavu] First execution including JIT compilation took 0.29082608222961426s. +[triton-dejavu] First execution including JIT compilation took 0.27058982849121094s. +[triton-dejavu] First execution including JIT compilation took 0.4739367961883545s. +[triton-dejavu] First execution including JIT compilation took 0.3220863342285156s. +[triton-dejavu] First execution including JIT compilation took 0.26685070991516113s. +[triton-dejavu] First execution including JIT compilation took 0.5877034664154053s. +[triton-dejavu] First execution including JIT compilation took 0.3485877513885498s. +[triton-dejavu] First execution including JIT compilation took 0.3099343776702881s. +[triton-dejavu] First execution including JIT compilation took 0.5288457870483398s. +[triton-dejavu] First execution including JIT compilation took 0.37487125396728516s. +[triton-dejavu] First execution including JIT compilation took 0.291426420211792s. +[triton-dejavu] First execution including JIT compilation took 0.5929708480834961s. +[triton-dejavu] First execution including JIT compilation took 0.39226531982421875s. +[triton-dejavu] First execution including JIT compilation took 0.30398011207580566s. +[triton-dejavu] First execution including JIT compilation took 0.6647982597351074s. +[triton-dejavu] First execution including JIT compilation took 0.42576146125793457s. +[triton-dejavu] First execution including JIT compilation took 0.35259008407592773s. +[triton-dejavu] First execution including JIT compilation took 0.6395070552825928s. +[triton-dejavu] First execution including JIT compilation took 0.3742716312408447s. +[triton-dejavu] First execution including JIT compilation took 0.2742881774902344s. 
+[triton-dejavu] First execution including JIT compilation took 0.7004520893096924s. +[triton-dejavu] First execution including JIT compilation took 0.3862569332122803s. +[triton-dejavu] First execution including JIT compilation took 0.2986783981323242s. +[triton-dejavu] First execution including JIT compilation took 1.1238834857940674s. +[triton-dejavu] First execution including JIT compilation took 0.4608597755432129s. +[triton-dejavu] First execution including JIT compilation took 0.34810447692871094s. +[triton-dejavu] First execution including JIT compilation took 1.2104661464691162s. +[triton-dejavu] First execution including JIT compilation took 0.4885828495025635s. +[triton-dejavu] First execution including JIT compilation took 0.32688188552856445s. +[triton-dejavu] First execution including JIT compilation took 1.2628145217895508s. +[triton-dejavu] First execution including JIT compilation took 0.5355226993560791s. +[triton-dejavu] First execution including JIT compilation took 0.3450770378112793s. +[triton-dejavu] First execution including JIT compilation took 1.3520443439483643s. +[triton-dejavu] First execution including JIT compilation took 0.5808374881744385s. +[triton-dejavu] First execution including JIT compilation took 0.3824028968811035s. +bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 1.3000195026397705s.
+[triton-dejavu] First execution including JIT compilation took 0.6455490589141846s.
+[triton-dejavu] First execution including JIT compilation took 0.37666845321655273s.
+[triton-dejavu] First execution including JIT compilation took 1.3013911247253418s.
+[triton-dejavu] First execution including JIT compilation took 0.6704864501953125s.
+[triton-dejavu] First execution including JIT compilation took 0.4055519104003906s.
+[triton-dejavu] First execution including JIT compilation took 5.386528968811035s. +[triton-dejavu] First execution including JIT compilation took 1.1679251194000244s. +[triton-dejavu] First execution including JIT compilation took 0.5064456462860107s. +[triton-dejavu] First execution including JIT compilation took 5.529943466186523s. +[triton-dejavu] First execution including JIT compilation took 1.0479810237884521s. +[triton-dejavu] First execution including JIT compilation took 0.61952805519104s. +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.3253762722015381s. +[triton-dejavu] First execution including JIT compilation took 0.25934815406799316s. +[triton-dejavu] First execution including JIT compilation took 0.21407032012939453s. +[triton-dejavu] First execution including JIT compilation took 0.3370664119720459s. +[triton-dejavu] First execution including JIT compilation took 0.28847193717956543s. +[triton-dejavu] First execution including JIT compilation took 0.23705148696899414s. 
+[triton-dejavu] First execution including JIT compilation took 0.3955824375152588s. +[triton-dejavu] First execution including JIT compilation took 0.3063540458679199s. +[triton-dejavu] First execution including JIT compilation took 0.25420117378234863s. +[triton-dejavu] First execution including JIT compilation took 0.4245755672454834s. +[triton-dejavu] First execution including JIT compilation took 0.32430315017700195s. +[triton-dejavu] First execution including JIT compilation took 0.269136905670166s. +[triton-dejavu] First execution including JIT compilation took 0.4198753833770752s. +[triton-dejavu] First execution including JIT compilation took 0.3307523727416992s. +[triton-dejavu] First execution including JIT compilation took 0.2702212333679199s. +[triton-dejavu] First execution including JIT compilation took 0.43901705741882324s. +[triton-dejavu] First execution including JIT compilation took 0.32950735092163086s. +[triton-dejavu] First execution including JIT compilation took 0.270906925201416s. +[triton-dejavu] First execution including JIT compilation took 0.4650428295135498s. +[triton-dejavu] First execution including JIT compilation took 0.35302281379699707s. +[triton-dejavu] First execution including JIT compilation took 0.2779378890991211s. +[triton-dejavu] First execution including JIT compilation took 0.4243738651275635s. +[triton-dejavu] First execution including JIT compilation took 0.35367321968078613s. +[triton-dejavu] First execution including JIT compilation took 0.24160146713256836s. +[triton-dejavu] First execution including JIT compilation took 0.4461050033569336s. +[triton-dejavu] First execution including JIT compilation took 0.313490629196167s. +[triton-dejavu] First execution including JIT compilation took 0.25301170349121094s. +[triton-dejavu] First execution including JIT compilation took 0.49121928215026855s. +[triton-dejavu] First execution including JIT compilation took 0.34533190727233887s. +[triton-dejavu] First execution including JIT compilation took 0.2868044376373291s. +[triton-dejavu] First execution including JIT compilation took 0.5529248714447021s. +[triton-dejavu] First execution including JIT compilation took 0.36022305488586426s. +[triton-dejavu] First execution including JIT compilation took 0.2978227138519287s. +[triton-dejavu] First execution including JIT compilation took 0.5883200168609619s. +[triton-dejavu] First execution including JIT compilation took 0.3919060230255127s. +[triton-dejavu] First execution including JIT compilation took 0.3099555969238281s. +[triton-dejavu] First execution including JIT compilation took 0.6135001182556152s. +[triton-dejavu] First execution including JIT compilation took 0.4088590145111084s. +[triton-dejavu] First execution including JIT compilation took 0.3194131851196289s. +[triton-dejavu] First execution including JIT compilation took 0.7036430835723877s. +[triton-dejavu] First execution including JIT compilation took 0.45201659202575684s. +[triton-dejavu] First execution including JIT compilation took 0.3642005920410156s. +[triton-dejavu] First execution including JIT compilation took 0.6664960384368896s. +[triton-dejavu] First execution including JIT compilation took 0.5301604270935059s. +[triton-dejavu] First execution including JIT compilation took 0.28510594367980957s. +[triton-dejavu] First execution including JIT compilation took 0.6785211563110352s. +[triton-dejavu] First execution including JIT compilation took 0.4087221622467041s. 
+[triton-dejavu] First execution including JIT compilation took 0.3024318218231201s.
+[triton-dejavu] First execution including JIT compilation took 1.157663106918335s.
+[triton-dejavu] First execution including JIT compilation took 0.47170138359069824s.
+[triton-dejavu] First execution including JIT compilation took 0.32757043838500977s.
+[triton-dejavu] First execution including JIT compilation took 1.209216833114624s.
+[triton-dejavu] First execution including JIT compilation took 0.49109315872192383s.
+[triton-dejavu] First execution including JIT compilation took 0.33685898780822754s.
+[triton-dejavu] First execution including JIT compilation took 1.275850772857666s.
+[triton-dejavu] First execution including JIT compilation took 4.450741529464722s.
+[triton-dejavu] First execution including JIT compilation took 0.3839271068572998s.
+[triton-dejavu] First execution including JIT compilation took 1.3883130550384521s.
+[triton-dejavu] First execution including JIT compilation took 0.6234548091888428s.
+[triton-dejavu] First execution including JIT compilation took 0.43517351150512695s.
+bench_cudagraph failed with out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 1.4472169876098633s.
+[triton-dejavu] First execution including JIT compilation took 0.6626391410827637s.
+[triton-dejavu] First execution including JIT compilation took 0.42397499084472656s.
+[triton-dejavu] First execution including JIT compilation took 1.535116195678711s.
+[triton-dejavu] First execution including JIT compilation took 0.6942274570465088s.
+[triton-dejavu] First execution including JIT compilation took 0.4427521228790283s.
+[triton-dejavu] First execution including JIT compilation took 5.385616302490234s.
+[triton-dejavu] First execution including JIT compilation took 1.1808125972747803s.
+[triton-dejavu] First execution including JIT compilation took 0.470653772354126s.
+[triton-dejavu] First execution including JIT compilation took 5.212445974349976s.
+[triton-dejavu] First execution including JIT compilation took 1.0829904079437256s.
+[triton-dejavu] First execution including JIT compilation took 0.42938661575317383s.
+bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 487424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 487424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 487424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 2.89522385597229s.
+[triton-dejavu] First execution including JIT compilation took 1.1873795986175537s.
+[triton-dejavu] First execution including JIT compilation took 0.5484645366668701s.
+[triton-dejavu] First execution including JIT compilation took 3.8715012073516846s.
+[triton-dejavu] First execution including JIT compilation took 1.5014572143554688s.
+[triton-dejavu] First execution including JIT compilation took 0.5261876583099365s.
+bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 696320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 696320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 696320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 974848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 974848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 974848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 0.16099905967712402s.
+[triton-dejavu] First execution including JIT compilation took 0.16479063034057617s.
+[triton-dejavu] First execution including JIT compilation took 0.16184067726135254s.
+[triton-dejavu] First execution including JIT compilation took 0.18947720527648926s.
+[triton-dejavu] First execution including JIT compilation took 0.17523646354675293s.
+[triton-dejavu] First execution including JIT compilation took 0.20532774925231934s.
+[triton-dejavu] First execution including JIT compilation took 0.3007838726043701s.
+[triton-dejavu] First execution including JIT compilation took 0.2070615291595459s.
+[triton-dejavu] First execution including JIT compilation took 0.19569897651672363s.
+[triton-dejavu] First execution including JIT compilation took 0.22059941291809082s.
+[triton-dejavu] First execution including JIT compilation took 0.2129993438720703s.
+[triton-dejavu] First execution including JIT compilation took 0.19128966331481934s.
+[triton-dejavu] First execution including JIT compilation took 0.23188138008117676s.
+[triton-dejavu] First execution including JIT compilation took 0.23681974411010742s.
+[triton-dejavu] First execution including JIT compilation took 0.20053791999816895s.
+[triton-dejavu] First execution including JIT compilation took 0.23547863960266113s.
+[triton-dejavu] First execution including JIT compilation took 0.24089622497558594s.
+[triton-dejavu] First execution including JIT compilation took 0.20086455345153809s.
+[triton-dejavu] First execution including JIT compilation took 0.2673182487487793s.
+[triton-dejavu] First execution including JIT compilation took 0.2323765754699707s.
+[triton-dejavu] First execution including JIT compilation took 0.20696759223937988s.
+[triton-dejavu] First execution including JIT compilation took 0.2162766456604004s.
+[triton-dejavu] First execution including JIT compilation took 0.19398021697998047s.
+[triton-dejavu] First execution including JIT compilation took 0.19721055030822754s.
+[... several hundred "[triton-dejavu] First execution including JIT compilation took ..." timing lines omitted; most values fall between roughly 0.19 s and 1.6 s, with a single 0.003 s outlier ...]
+bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[... the OutOfResources traceback and "bench_cudagraph failed" message shown above repeat three times for Required: 245760 and three times for Required: 344064, always against the 232448-byte hardware limit ...]
+[... 6 "[triton-dejavu] First execution including JIT compilation took ..." timing lines (0.20 s to 0.29 s) omitted ...]
+[... roughly 75 more "[triton-dejavu] First execution including JIT compilation took ..." timing lines omitted; values range from about 0.19 s to 1.5 s ...]
+bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[... the OutOfResources traceback and "bench_cudagraph failed" message repeat three times for Required: 286720 ...]
+[... 9 "[triton-dejavu] First execution including JIT compilation took ..." timing lines (0.36 s to 5.8 s) omitted ...]
+bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[... the OutOfResources traceback and "bench_cudagraph failed" message repeat three times each for Required: 245760, 327680, 409600, and 573440, all against the 232448-byte hardware limit ...]
+[... 6 "[triton-dejavu] First execution including JIT compilation took ..." timing lines (0.22 s to 0.40 s) omitted ...]
+[triton-dejavu] First execution including JIT compilation took 0.3940160274505615s. +[triton-dejavu] First execution including JIT compilation took 0.29627037048339844s. +[triton-dejavu] First execution including JIT compilation took 0.23922204971313477s. +[triton-dejavu] First execution including JIT compilation took 0.4309115409851074s. +[triton-dejavu] First execution including JIT compilation took 0.3773322105407715s. +[triton-dejavu] First execution including JIT compilation took 0.31682252883911133s. +[triton-dejavu] First execution including JIT compilation took 0.4817657470703125s. +[triton-dejavu] First execution including JIT compilation took 0.30999183654785156s. +[triton-dejavu] First execution including JIT compilation took 0.2649409770965576s. +[triton-dejavu] First execution including JIT compilation took 0.4654359817504883s. +[triton-dejavu] First execution including JIT compilation took 0.3404858112335205s. +[triton-dejavu] First execution including JIT compilation took 0.2549777030944824s. +[triton-dejavu] First execution including JIT compilation took 0.5529801845550537s. +[triton-dejavu] First execution including JIT compilation took 0.39357733726501465s. +[triton-dejavu] First execution including JIT compilation took 0.2779700756072998s. +[triton-dejavu] First execution including JIT compilation took 0.4679603576660156s. +[triton-dejavu] First execution including JIT compilation took 0.3258848190307617s. +[triton-dejavu] First execution including JIT compilation took 0.22054314613342285s. +[triton-dejavu] First execution including JIT compilation took 0.5082552433013916s. +[triton-dejavu] First execution including JIT compilation took 0.33693814277648926s. +[triton-dejavu] First execution including JIT compilation took 0.2745835781097412s. +[triton-dejavu] First execution including JIT compilation took 0.5847163200378418s. +[triton-dejavu] First execution including JIT compilation took 0.33575940132141113s. +[triton-dejavu] First execution including JIT compilation took 0.3060939311981201s. +[triton-dejavu] First execution including JIT compilation took 0.5239126682281494s. +[triton-dejavu] First execution including JIT compilation took 0.35081052780151367s. +[triton-dejavu] First execution including JIT compilation took 0.2809262275695801s. +[triton-dejavu] First execution including JIT compilation took 0.5589377880096436s. +[triton-dejavu] First execution including JIT compilation took 0.36190342903137207s. +[triton-dejavu] First execution including JIT compilation took 0.27885007858276367s. +[triton-dejavu] First execution including JIT compilation took 0.6101348400115967s. +[triton-dejavu] First execution including JIT compilation took 0.4172549247741699s. +[triton-dejavu] First execution including JIT compilation took 0.3286736011505127s. +[triton-dejavu] First execution including JIT compilation took 0.6457531452178955s. +[triton-dejavu] First execution including JIT compilation took 0.39678049087524414s. +[triton-dejavu] First execution including JIT compilation took 0.30982041358947754s. +[triton-dejavu] First execution including JIT compilation took 0.6744742393493652s. +[triton-dejavu] First execution including JIT compilation took 0.42897796630859375s. +[triton-dejavu] First execution including JIT compilation took 0.26523256301879883s. +[triton-dejavu] First execution including JIT compilation took 0.7656145095825195s. +[triton-dejavu] First execution including JIT compilation took 0.40720272064208984s. 
+[triton-dejavu] First execution including JIT compilation took 0.28449296951293945s. +[triton-dejavu] First execution including JIT compilation took 1.1293773651123047s. +[triton-dejavu] First execution including JIT compilation took 0.5252115726470947s. +[triton-dejavu] First execution including JIT compilation took 0.3610687255859375s. +[triton-dejavu] First execution including JIT compilation took 1.4119246006011963s. +[triton-dejavu] First execution including JIT compilation took 0.6041393280029297s. +[triton-dejavu] First execution including JIT compilation took 0.3884885311126709s. +[triton-dejavu] First execution including JIT compilation took 1.526637315750122s. +[triton-dejavu] First execution including JIT compilation took 0.6266424655914307s. +[triton-dejavu] First execution including JIT compilation took 0.40192389488220215s. +[triton-dejavu] First execution including JIT compilation took 1.530601978302002s. +[triton-dejavu] First execution including JIT compilation took 0.6515090465545654s. +[triton-dejavu] First execution including JIT compilation took 0.42690610885620117s. +bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 1.7079706192016602s. +[triton-dejavu] First execution including JIT compilation took 0.7440791130065918s. +[triton-dejavu] First execution including JIT compilation took 0.4444162845611572s. +[triton-dejavu] First execution including JIT compilation took 1.7328886985778809s. +[triton-dejavu] First execution including JIT compilation took 0.7971758842468262s. +[triton-dejavu] First execution including JIT compilation took 0.47760677337646484s. 
+[triton-dejavu] First execution including JIT compilation took 5.828885316848755s. +[triton-dejavu] First execution including JIT compilation took 1.288949966430664s. +[triton-dejavu] First execution including JIT compilation took 0.5151238441467285s. +[triton-dejavu] First execution including JIT compilation took 5.60798192024231s. +[triton-dejavu] First execution including JIT compilation took 1.3259506225585938s. +[triton-dejavu] First execution including JIT compilation took 0.5839335918426514s. +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 3.490701913833618s. +[triton-dejavu] First execution including JIT compilation took 1.3819916248321533s. +[triton-dejavu] First execution including JIT compilation took 0.681626558303833s. +[triton-dejavu] First execution including JIT compilation took 4.5987389087677s. +[triton-dejavu] First execution including JIT compilation took 1.3767080307006836s. +[triton-dejavu] First execution including JIT compilation took 0.6134452819824219s. 
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 737280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 737280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 737280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 737280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 737280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 737280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1032192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1032192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1032192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1032192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1032192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1032192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.20271754264831543s. +[triton-dejavu] First execution including JIT compilation took 0.17133116722106934s. +[triton-dejavu] First execution including JIT compilation took 0.1536731719970703s. +[triton-dejavu] First execution including JIT compilation took 0.19096851348876953s. +[triton-dejavu] First execution including JIT compilation took 0.17846441268920898s. +[triton-dejavu] First execution including JIT compilation took 0.1891782283782959s. +[triton-dejavu] First execution including JIT compilation took 0.23286938667297363s. +[triton-dejavu] First execution including JIT compilation took 0.21250367164611816s. +[triton-dejavu] First execution including JIT compilation took 0.2115018367767334s. +[triton-dejavu] First execution including JIT compilation took 0.25536513328552246s. +[triton-dejavu] First execution including JIT compilation took 0.26403188705444336s. +[triton-dejavu] First execution including JIT compilation took 0.20061635971069336s. +[triton-dejavu] First execution including JIT compilation took 0.27070093154907227s. +[triton-dejavu] First execution including JIT compilation took 0.2410728931427002s. +[triton-dejavu] First execution including JIT compilation took 0.2050936222076416s. +[triton-dejavu] First execution including JIT compilation took 0.28530001640319824s. +[triton-dejavu] First execution including JIT compilation took 0.24301719665527344s. +[triton-dejavu] First execution including JIT compilation took 0.2149193286895752s. +[triton-dejavu] First execution including JIT compilation took 0.29409146308898926s. +[triton-dejavu] First execution including JIT compilation took 0.2827568054199219s. +[triton-dejavu] First execution including JIT compilation took 0.2236497402191162s. +[triton-dejavu] First execution including JIT compilation took 0.2511780261993408s. +[triton-dejavu] First execution including JIT compilation took 0.21310901641845703s. +[triton-dejavu] First execution including JIT compilation took 0.19583463668823242s. 
+[triton-dejavu] First execution including JIT compilation took 0.270064115524292s. +[triton-dejavu] First execution including JIT compilation took 0.2317502498626709s. +[triton-dejavu] First execution including JIT compilation took 0.2144770622253418s. +[triton-dejavu] First execution including JIT compilation took 0.28452420234680176s. +[triton-dejavu] First execution including JIT compilation took 0.24449563026428223s. +[triton-dejavu] First execution including JIT compilation took 0.2711045742034912s. +[triton-dejavu] First execution including JIT compilation took 0.30714941024780273s. +[triton-dejavu] First execution including JIT compilation took 0.2577242851257324s. +[triton-dejavu] First execution including JIT compilation took 0.23275232315063477s. +[triton-dejavu] First execution including JIT compilation took 0.32830142974853516s. +[triton-dejavu] First execution including JIT compilation took 0.25277233123779297s. +[triton-dejavu] First execution including JIT compilation took 0.23861432075500488s. +[triton-dejavu] First execution including JIT compilation took 0.31818604469299316s. +[triton-dejavu] First execution including JIT compilation took 0.26758432388305664s. +[triton-dejavu] First execution including JIT compilation took 0.2486262321472168s. +[triton-dejavu] First execution including JIT compilation took 0.3456125259399414s. +[triton-dejavu] First execution including JIT compilation took 0.33374500274658203s. +[triton-dejavu] First execution including JIT compilation took 0.2485215663909912s. +[triton-dejavu] First execution including JIT compilation took 0.30871033668518066s. +[triton-dejavu] First execution including JIT compilation took 0.22252321243286133s. +[triton-dejavu] First execution including JIT compilation took 0.20645499229431152s. +[triton-dejavu] First execution including JIT compilation took 0.3251798152923584s. +[triton-dejavu] First execution including JIT compilation took 0.2487037181854248s. +[triton-dejavu] First execution including JIT compilation took 0.22485733032226562s. +[triton-dejavu] First execution including JIT compilation took 0.33643627166748047s. +[triton-dejavu] First execution including JIT compilation took 0.2661266326904297s. +[triton-dejavu] First execution including JIT compilation took 0.2295246124267578s. +[triton-dejavu] First execution including JIT compilation took 0.38455843925476074s. +[triton-dejavu] First execution including JIT compilation took 0.2738194465637207s. +[triton-dejavu] First execution including JIT compilation took 0.24585938453674316s. +[triton-dejavu] First execution including JIT compilation took 0.4033064842224121s. +[triton-dejavu] First execution including JIT compilation took 0.2825932502746582s. +[triton-dejavu] First execution including JIT compilation took 0.2537994384765625s. +[triton-dejavu] First execution including JIT compilation took 0.4199497699737549s. +[triton-dejavu] First execution including JIT compilation took 0.2951374053955078s. +[triton-dejavu] First execution including JIT compilation took 0.2603449821472168s. +[triton-dejavu] First execution including JIT compilation took 0.4944791793823242s. +[triton-dejavu] First execution including JIT compilation took 0.3230445384979248s. +[triton-dejavu] First execution including JIT compilation took 0.29880690574645996s. +[triton-dejavu] First execution including JIT compilation took 0.4109377861022949s. +[triton-dejavu] First execution including JIT compilation took 0.27936363220214844s. 
+[triton-dejavu] First execution including JIT compilation took 0.23092174530029297s. +[triton-dejavu] First execution including JIT compilation took 0.428159236907959s. +[triton-dejavu] First execution including JIT compilation took 0.2879374027252197s. +[triton-dejavu] First execution including JIT compilation took 0.2565889358520508s. +[triton-dejavu] First execution including JIT compilation took 0.5160079002380371s. +[triton-dejavu] First execution including JIT compilation took 0.31639909744262695s. +[triton-dejavu] First execution including JIT compilation took 0.2591373920440674s. +[triton-dejavu] First execution including JIT compilation took 0.5452303886413574s. +[triton-dejavu] First execution including JIT compilation took 0.3242976665496826s. +[triton-dejavu] First execution including JIT compilation took 0.2623326778411865s. +[triton-dejavu] First execution including JIT compilation took 0.5922431945800781s. +[triton-dejavu] First execution including JIT compilation took 0.34310412406921387s. +[triton-dejavu] First execution including JIT compilation took 0.27410078048706055s. +[triton-dejavu] First execution including JIT compilation took 0.6470179557800293s. +[triton-dejavu] First execution including JIT compilation took 0.3510866165161133s. +[triton-dejavu] First execution including JIT compilation took 0.2860851287841797s. +[triton-dejavu] First execution including JIT compilation took 0.7252414226531982s. +[triton-dejavu] First execution including JIT compilation took 0.4050569534301758s. +[triton-dejavu] First execution including JIT compilation took 0.30899977684020996s. +[triton-dejavu] First execution including JIT compilation took 0.6849832534790039s. +[triton-dejavu] First execution including JIT compilation took 0.4114108085632324s. +[triton-dejavu] First execution including JIT compilation took 0.26646900177001953s. +[triton-dejavu] First execution including JIT compilation took 0.731346845626831s. +[triton-dejavu] First execution including JIT compilation took 0.37401604652404785s. +[triton-dejavu] First execution including JIT compilation took 0.2981090545654297s. +[triton-dejavu] First execution including JIT compilation took 1.255406141281128s. +[triton-dejavu] First execution including JIT compilation took 0.4646761417388916s. +[triton-dejavu] First execution including JIT compilation took 0.31566929817199707s. +[triton-dejavu] First execution including JIT compilation took 1.3867464065551758s. +[triton-dejavu] First execution including JIT compilation took 0.4915659427642822s. +[triton-dejavu] First execution including JIT compilation took 0.336292028427124s. +[triton-dejavu] First execution including JIT compilation took 1.4543449878692627s. +[triton-dejavu] First execution including JIT compilation took 0.5164680480957031s. +[triton-dejavu] First execution including JIT compilation took 0.35480237007141113s. +[triton-dejavu] First execution including JIT compilation took 1.5047898292541504s. +bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.2522404193878174s. +[triton-dejavu] First execution including JIT compilation took 0.20596766471862793s. +[triton-dejavu] First execution including JIT compilation took 0.19162297248840332s. +[triton-dejavu] First execution including JIT compilation took 0.26123833656311035s. +[triton-dejavu] First execution including JIT compilation took 0.22704195976257324s. +[triton-dejavu] First execution including JIT compilation took 0.2119007110595703s. +[triton-dejavu] First execution including JIT compilation took 0.29358959197998047s. +[triton-dejavu] First execution including JIT compilation took 0.23833417892456055s. +[triton-dejavu] First execution including JIT compilation took 0.2248821258544922s. +[triton-dejavu] First execution including JIT compilation took 0.29287123680114746s. +[triton-dejavu] First execution including JIT compilation took 0.25087571144104004s. +[triton-dejavu] First execution including JIT compilation took 0.23653793334960938s. +[triton-dejavu] First execution including JIT compilation took 0.3085510730743408s. +[triton-dejavu] First execution including JIT compilation took 0.22148561477661133s. +[triton-dejavu] First execution including JIT compilation took 0.20212173461914062s. +[triton-dejavu] First execution including JIT compilation took 0.26024508476257324s. +[triton-dejavu] First execution including JIT compilation took 0.20862603187561035s. +[triton-dejavu] First execution including JIT compilation took 0.2089087963104248s. +[triton-dejavu] First execution including JIT compilation took 0.28104591369628906s. +[triton-dejavu] First execution including JIT compilation took 0.2973470687866211s. +[triton-dejavu] First execution including JIT compilation took 0.20228052139282227s. +[triton-dejavu] First execution including JIT compilation took 0.3051731586456299s. +[triton-dejavu] First execution including JIT compilation took 0.21213650703430176s. +[triton-dejavu] First execution including JIT compilation took 0.18593811988830566s. 
+[triton-dejavu] First execution including JIT compilation took 0.25663161277770996s. +[triton-dejavu] First execution including JIT compilation took 0.2507617473602295s. +[triton-dejavu] First execution including JIT compilation took 0.2245655059814453s. +[triton-dejavu] First execution including JIT compilation took 0.3247964382171631s. +[triton-dejavu] First execution including JIT compilation took 0.25675010681152344s. +[triton-dejavu] First execution including JIT compilation took 0.2267463207244873s. +[triton-dejavu] First execution including JIT compilation took 0.3354456424713135s. +[triton-dejavu] First execution including JIT compilation took 0.25986337661743164s. +[triton-dejavu] First execution including JIT compilation took 0.23006272315979004s. +[triton-dejavu] First execution including JIT compilation took 0.002689838409423828s. +[triton-dejavu] First execution including JIT compilation took 0.2550840377807617s. +[triton-dejavu] First execution including JIT compilation took 0.24457740783691406s. +[triton-dejavu] First execution including JIT compilation took 0.34949541091918945s. +[triton-dejavu] First execution including JIT compilation took 0.2756638526916504s. +[triton-dejavu] First execution including JIT compilation took 0.2527437210083008s. +[triton-dejavu] First execution including JIT compilation took 0.3787045478820801s. +[triton-dejavu] First execution including JIT compilation took 0.2873713970184326s. +[triton-dejavu] First execution including JIT compilation took 0.2592127323150635s. +[triton-dejavu] First execution including JIT compilation took 0.33814334869384766s. +[triton-dejavu] First execution including JIT compilation took 0.2517428398132324s. +[triton-dejavu] First execution including JIT compilation took 0.21767520904541016s. +[triton-dejavu] First execution including JIT compilation took 0.36879444122314453s. +[triton-dejavu] First execution including JIT compilation took 0.2698078155517578s. +[triton-dejavu] First execution including JIT compilation took 0.2365264892578125s. +[triton-dejavu] First execution including JIT compilation took 0.47873687744140625s. +[triton-dejavu] First execution including JIT compilation took 0.2871267795562744s. +[triton-dejavu] First execution including JIT compilation took 0.24500083923339844s. +[triton-dejavu] First execution including JIT compilation took 0.4963796138763428s. +[triton-dejavu] First execution including JIT compilation took 0.30948710441589355s. +[triton-dejavu] First execution including JIT compilation took 0.25137853622436523s. +[triton-dejavu] First execution including JIT compilation took 0.4584636688232422s. +[triton-dejavu] First execution including JIT compilation took 0.3162257671356201s. +[triton-dejavu] First execution including JIT compilation took 0.2994105815887451s. +[triton-dejavu] First execution including JIT compilation took 0.4786410331726074s. +[triton-dejavu] First execution including JIT compilation took 0.3190131187438965s. +[triton-dejavu] First execution including JIT compilation took 0.2704010009765625s. +[triton-dejavu] First execution including JIT compilation took 0.5149548053741455s. +[triton-dejavu] First execution including JIT compilation took 0.3378560543060303s. +[triton-dejavu] First execution including JIT compilation took 0.28589439392089844s. +[triton-dejavu] First execution including JIT compilation took 0.47048139572143555s. +[triton-dejavu] First execution including JIT compilation took 0.28485631942749023s. 
+[triton-dejavu] First execution including JIT compilation took 0.23804211616516113s. +[triton-dejavu] First execution including JIT compilation took 0.4914519786834717s. +[triton-dejavu] First execution including JIT compilation took 0.30657291412353516s. +[triton-dejavu] First execution including JIT compilation took 0.2527627944946289s. +[triton-dejavu] First execution including JIT compilation took 0.7375938892364502s. +[triton-dejavu] First execution including JIT compilation took 0.33797788619995117s. +[triton-dejavu] First execution including JIT compilation took 0.27010035514831543s. +[triton-dejavu] First execution including JIT compilation took 0.6535592079162598s. +[triton-dejavu] First execution including JIT compilation took 0.35745692253112793s. +[triton-dejavu] First execution including JIT compilation took 0.2837181091308594s. +[triton-dejavu] First execution including JIT compilation took 0.6975975036621094s. +[triton-dejavu] First execution including JIT compilation took 0.4016244411468506s. +[triton-dejavu] First execution including JIT compilation took 0.30004215240478516s. +[triton-dejavu] First execution including JIT compilation took 0.7542321681976318s. +[triton-dejavu] First execution including JIT compilation took 0.4112386703491211s. +[triton-dejavu] First execution including JIT compilation took 0.3136770725250244s. +[triton-dejavu] First execution including JIT compilation took 0.854001522064209s. +[triton-dejavu] First execution including JIT compilation took 0.5444228649139404s. +[triton-dejavu] First execution including JIT compilation took 0.34048891067504883s. +[triton-dejavu] First execution including JIT compilation took 0.8623373508453369s. +[triton-dejavu] First execution including JIT compilation took 0.4289731979370117s. +[triton-dejavu] First execution including JIT compilation took 0.2985663414001465s. +[triton-dejavu] First execution including JIT compilation took 0.8375389575958252s. +[triton-dejavu] First execution including JIT compilation took 0.4361135959625244s. +[triton-dejavu] First execution including JIT compilation took 0.3154265880584717s. +[triton-dejavu] First execution including JIT compilation took 1.547863483428955s. +[triton-dejavu] First execution including JIT compilation took 0.511976957321167s. +[triton-dejavu] First execution including JIT compilation took 0.33377766609191895s. +[triton-dejavu] First execution including JIT compilation took 1.6245229244232178s. +[triton-dejavu] First execution including JIT compilation took 0.5486083030700684s. +[triton-dejavu] First execution including JIT compilation took 0.35009217262268066s. +[triton-dejavu] First execution including JIT compilation took 1.730604887008667s. +bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.31005024909973145s. +[triton-dejavu] First execution including JIT compilation took 0.2568016052246094s. +[triton-dejavu] First execution including JIT compilation took 0.21983957290649414s. +[triton-dejavu] First execution including JIT compilation took 0.3342258930206299s. +[triton-dejavu] First execution including JIT compilation took 0.2613508701324463s. +[triton-dejavu] First execution including JIT compilation took 0.23277544975280762s. 
+[triton-dejavu] First execution including JIT compilation took 0.34593868255615234s. +[triton-dejavu] First execution including JIT compilation took 0.27214527130126953s. +[triton-dejavu] First execution including JIT compilation took 0.24357295036315918s. +[triton-dejavu] First execution including JIT compilation took 0.3831825256347656s. +[triton-dejavu] First execution including JIT compilation took 0.2801399230957031s. +[triton-dejavu] First execution including JIT compilation took 0.28713178634643555s. +[triton-dejavu] First execution including JIT compilation took 0.3746922016143799s. +[triton-dejavu] First execution including JIT compilation took 0.29146361351013184s. +[triton-dejavu] First execution including JIT compilation took 0.25294995307922363s. +[triton-dejavu] First execution including JIT compilation took 0.3896350860595703s. +[triton-dejavu] First execution including JIT compilation took 0.3028104305267334s. +[triton-dejavu] First execution including JIT compilation took 0.2598695755004883s. +[triton-dejavu] First execution including JIT compilation took 0.4107673168182373s. +[triton-dejavu] First execution including JIT compilation took 0.3029160499572754s. +[triton-dejavu] First execution including JIT compilation took 0.27234864234924316s. +[triton-dejavu] First execution including JIT compilation took 0.3524813652038574s. +[triton-dejavu] First execution including JIT compilation took 0.2637143135070801s. +[triton-dejavu] First execution including JIT compilation took 0.21795105934143066s. +[triton-dejavu] First execution including JIT compilation took 0.36962461471557617s. +[triton-dejavu] First execution including JIT compilation took 0.2753579616546631s. +[triton-dejavu] First execution including JIT compilation took 0.24502253532409668s. +[triton-dejavu] First execution including JIT compilation took 0.38353514671325684s. +[triton-dejavu] First execution including JIT compilation took 0.25853633880615234s. +[triton-dejavu] First execution including JIT compilation took 0.23975038528442383s. +[triton-dejavu] First execution including JIT compilation took 0.0030221939086914062s. +[triton-dejavu] First execution including JIT compilation took 0.29683613777160645s. +[triton-dejavu] First execution including JIT compilation took 0.2580904960632324s. +[triton-dejavu] First execution including JIT compilation took 0.4290771484375s. +[triton-dejavu] First execution including JIT compilation took 0.3167991638183594s. +[triton-dejavu] First execution including JIT compilation took 0.2567250728607178s. +[triton-dejavu] First execution including JIT compilation took 0.44550418853759766s. +[triton-dejavu] First execution including JIT compilation took 0.3198390007019043s. +[triton-dejavu] First execution including JIT compilation took 0.268108606338501s. +[triton-dejavu] First execution including JIT compilation took 0.4916553497314453s. +[triton-dejavu] First execution including JIT compilation took 0.3439137935638428s. +[triton-dejavu] First execution including JIT compilation took 0.27727365493774414s. +[triton-dejavu] First execution including JIT compilation took 0.460857629776001s. +[triton-dejavu] First execution including JIT compilation took 0.30243563652038574s. +[triton-dejavu] First execution including JIT compilation took 0.24333858489990234s. +[triton-dejavu] First execution including JIT compilation took 0.46892428398132324s. +[triton-dejavu] First execution including JIT compilation took 0.3167304992675781s. 
+[triton-dejavu] First execution including JIT compilation took 0.2599649429321289s. +[triton-dejavu] First execution including JIT compilation took 0.5126926898956299s. +[triton-dejavu] First execution including JIT compilation took 0.32805609703063965s. +[triton-dejavu] First execution including JIT compilation took 0.26161670684814453s. +[triton-dejavu] First execution including JIT compilation took 0.5467493534088135s. +[triton-dejavu] First execution including JIT compilation took 0.3979170322418213s. +[triton-dejavu] First execution including JIT compilation took 0.27261829376220703s. +[triton-dejavu] First execution including JIT compilation took 0.56540846824646s. +[triton-dejavu] First execution including JIT compilation took 0.35355091094970703s. +[triton-dejavu] First execution including JIT compilation took 0.276700496673584s. +[triton-dejavu] First execution including JIT compilation took 0.5869178771972656s. +[triton-dejavu] First execution including JIT compilation took 0.3624422550201416s. +[triton-dejavu] First execution including JIT compilation took 0.35153841972351074s. +[triton-dejavu] First execution including JIT compilation took 0.6571488380432129s. +[triton-dejavu] First execution including JIT compilation took 0.3958284854888916s. +[triton-dejavu] First execution including JIT compilation took 0.30527758598327637s. +[triton-dejavu] First execution including JIT compilation took 0.6626615524291992s. +[triton-dejavu] First execution including JIT compilation took 0.3544487953186035s. +[triton-dejavu] First execution including JIT compilation took 0.2698044776916504s. +[triton-dejavu] First execution including JIT compilation took 0.6961638927459717s. +[triton-dejavu] First execution including JIT compilation took 0.38259434700012207s. +[triton-dejavu] First execution including JIT compilation took 0.283905029296875s. +[triton-dejavu] First execution including JIT compilation took 0.845867395401001s. +[triton-dejavu] First execution including JIT compilation took 0.4127688407897949s. +[triton-dejavu] First execution including JIT compilation took 0.3159315586090088s. +[triton-dejavu] First execution including JIT compilation took 0.9087560176849365s. +[triton-dejavu] First execution including JIT compilation took 0.4513425827026367s. +[triton-dejavu] First execution including JIT compilation took 0.3294107913970947s. +[triton-dejavu] First execution including JIT compilation took 0.9571695327758789s. +[triton-dejavu] First execution including JIT compilation took 0.4684031009674072s. +[triton-dejavu] First execution including JIT compilation took 0.32914233207702637s. +[triton-dejavu] First execution including JIT compilation took 1.0359725952148438s. +[triton-dejavu] First execution including JIT compilation took 0.48003387451171875s. +[triton-dejavu] First execution including JIT compilation took 0.34207773208618164s. +[triton-dejavu] First execution including JIT compilation took 1.1305363178253174s. +bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 1.2873709201812744s. +[triton-dejavu] First execution including JIT compilation took 0.534212589263916s. +[triton-dejavu] First execution including JIT compilation took 0.34093737602233887s. +[triton-dejavu] First execution including JIT compilation took 1.2213225364685059s. +[triton-dejavu] First execution including JIT compilation took 0.5500822067260742s. +[triton-dejavu] First execution including JIT compilation took 0.3482015132904053s. 
+[triton-dejavu] First execution including JIT compilation took 2.321138620376587s. +[triton-dejavu] First execution including JIT compilation took 0.5398764610290527s. +[triton-dejavu] First execution including JIT compilation took 0.3589463233947754s. +[triton-dejavu] First execution including JIT compilation took 2.2305822372436523s. +bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+ +bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 458752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.4210233688354492s. +[triton-dejavu] First execution including JIT compilation took 0.2671318054199219s. +[triton-dejavu] First execution including JIT compilation took 0.20480823516845703s. +[triton-dejavu] First execution including JIT compilation took 0.36168575286865234s. +[triton-dejavu] First execution including JIT compilation took 0.2831258773803711s. +[triton-dejavu] First execution including JIT compilation took 0.22981572151184082s. +[triton-dejavu] First execution including JIT compilation took 0.3903160095214844s. +[triton-dejavu] First execution including JIT compilation took 0.2804446220397949s. +[triton-dejavu] First execution including JIT compilation took 0.22222447395324707s. +[triton-dejavu] First execution including JIT compilation took 0.3762855529785156s. +[triton-dejavu] First execution including JIT compilation took 0.2824244499206543s. +[triton-dejavu] First execution including JIT compilation took 0.22802186012268066s. +[triton-dejavu] First execution including JIT compilation took 0.4042317867279053s. +[triton-dejavu] First execution including JIT compilation took 0.2850303649902344s. +[triton-dejavu] First execution including JIT compilation took 0.22367358207702637s. +[triton-dejavu] First execution including JIT compilation took 0.45253777503967285s. +[triton-dejavu] First execution including JIT compilation took 0.3078906536102295s. +[triton-dejavu] First execution including JIT compilation took 0.23833608627319336s. +[triton-dejavu] First execution including JIT compilation took 0.47820162773132324s. +[triton-dejavu] First execution including JIT compilation took 0.332599401473999s. +[triton-dejavu] First execution including JIT compilation took 0.256058931350708s. +[triton-dejavu] First execution including JIT compilation took 0.4336233139038086s. +[triton-dejavu] First execution including JIT compilation took 0.2906990051269531s. +[triton-dejavu] First execution including JIT compilation took 0.22593021392822266s. +[triton-dejavu] First execution including JIT compilation took 0.43659496307373047s. 
+[triton-dejavu] First execution including JIT compilation took 0.295365571975708s. +[triton-dejavu] First execution including JIT compilation took 0.3340928554534912s. +[triton-dejavu] First execution including JIT compilation took 0.4568207263946533s. +[triton-dejavu] First execution including JIT compilation took 0.34474825859069824s. +[triton-dejavu] First execution including JIT compilation took 0.2425243854522705s. +[triton-dejavu] First execution including JIT compilation took 0.48821306228637695s. +[triton-dejavu] First execution including JIT compilation took 0.0030794143676757812s. +[triton-dejavu] First execution including JIT compilation took 0.666248083114624s. +[triton-dejavu] First execution including JIT compilation took 0.9645810127258301s. +[triton-dejavu] First execution including JIT compilation took 0.43552494049072266s. +[triton-dejavu] First execution including JIT compilation took 0.3096005916595459s. +[triton-dejavu] First execution including JIT compilation took 0.6565234661102295s. +[triton-dejavu] First execution including JIT compilation took 0.49860286712646484s. +[triton-dejavu] First execution including JIT compilation took 0.31050992012023926s. +[triton-dejavu] First execution including JIT compilation took 0.696462869644165s. +[triton-dejavu] First execution including JIT compilation took 0.4284684658050537s. +[triton-dejavu] First execution including JIT compilation took 0.3179745674133301s. +[triton-dejavu] First execution including JIT compilation took 0.6832358837127686s. +[triton-dejavu] First execution including JIT compilation took 0.4428989887237549s. +[triton-dejavu] First execution including JIT compilation took 0.27704954147338867s. +[triton-dejavu] First execution including JIT compilation took 0.7220911979675293s. +[triton-dejavu] First execution including JIT compilation took 0.4185624122619629s. +[triton-dejavu] First execution including JIT compilation took 0.2997853755950928s. +[triton-dejavu] First execution including JIT compilation took 0.769294023513794s. +[triton-dejavu] First execution including JIT compilation took 0.44492197036743164s. +[triton-dejavu] First execution including JIT compilation took 0.3919029235839844s. +[triton-dejavu] First execution including JIT compilation took 0.8174152374267578s. +[triton-dejavu] First execution including JIT compilation took 0.4800558090209961s. +[triton-dejavu] First execution including JIT compilation took 0.3278632164001465s. +[triton-dejavu] First execution including JIT compilation took 0.8820762634277344s. +[triton-dejavu] First execution including JIT compilation took 0.4979724884033203s. +[triton-dejavu] First execution including JIT compilation took 0.3491017818450928s. +[triton-dejavu] First execution including JIT compilation took 0.9607341289520264s. +[triton-dejavu] First execution including JIT compilation took 0.5307338237762451s. +[triton-dejavu] First execution including JIT compilation took 0.3707716464996338s. +[triton-dejavu] First execution including JIT compilation took 1.0402915477752686s. +[triton-dejavu] First execution including JIT compilation took 0.5747923851013184s. +[triton-dejavu] First execution including JIT compilation took 0.3863534927368164s. +[triton-dejavu] First execution including JIT compilation took 1.1301944255828857s. +[triton-dejavu] First execution including JIT compilation took 0.5652072429656982s. +[triton-dejavu] First execution including JIT compilation took 0.34700870513916016s. 
+[triton-dejavu] First execution including JIT compilation took 1.0828943252563477s. +[triton-dejavu] First execution including JIT compilation took 0.5150623321533203s. +[triton-dejavu] First execution including JIT compilation took 0.2979111671447754s. +[triton-dejavu] First execution including JIT compilation took 1.529569149017334s. +[triton-dejavu] First execution including JIT compilation took 0.6551094055175781s. +[triton-dejavu] First execution including JIT compilation took 0.38155317306518555s. +[triton-dejavu] First execution including JIT compilation took 1.766725778579712s. +[triton-dejavu] First execution including JIT compilation took 0.6843061447143555s. +[triton-dejavu] First execution including JIT compilation took 0.4045257568359375s. +[triton-dejavu] First execution including JIT compilation took 1.8947250843048096s. +bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 2.4253039360046387s. +[triton-dejavu] First execution including JIT compilation took 1.1816015243530273s. +[triton-dejavu] First execution including JIT compilation took 0.4922316074371338s. +[triton-dejavu] First execution including JIT compilation took 2.298893451690674s. +[triton-dejavu] First execution including JIT compilation took 1.2072784900665283s. +[triton-dejavu] First execution including JIT compilation took 0.5224888324737549s. +[triton-dejavu] First execution including JIT compilation took 6.951720952987671s. +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. 
Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 688128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.6793637275695801s. +[triton-dejavu] First execution including JIT compilation took 0.6418313980102539s. +[triton-dejavu] First execution including JIT compilation took 0.32692575454711914s. +[triton-dejavu] First execution including JIT compilation took 0.8132402896881104s. +[triton-dejavu] First execution including JIT compilation took 0.5734570026397705s. +[triton-dejavu] First execution including JIT compilation took 0.35916733741760254s. +[triton-dejavu] First execution including JIT compilation took 0.8288097381591797s. +[triton-dejavu] First execution including JIT compilation took 0.5815913677215576s. +[triton-dejavu] First execution including JIT compilation took 0.39780449867248535s. +[triton-dejavu] First execution including JIT compilation took 0.8627204895019531s. +[triton-dejavu] First execution including JIT compilation took 0.5819535255432129s. +[triton-dejavu] First execution including JIT compilation took 0.3681964874267578s. +[triton-dejavu] First execution including JIT compilation took 0.8420388698577881s. +[triton-dejavu] First execution including JIT compilation took 0.5943279266357422s. +[triton-dejavu] First execution including JIT compilation took 0.36092662811279297s. +[triton-dejavu] First execution including JIT compilation took 0.8624413013458252s. +[triton-dejavu] First execution including JIT compilation took 0.5882468223571777s. +[triton-dejavu] First execution including JIT compilation took 0.3868570327758789s. +[triton-dejavu] First execution including JIT compilation took 0.9039130210876465s. +[triton-dejavu] First execution including JIT compilation took 0.6410880088806152s. +[triton-dejavu] First execution including JIT compilation took 0.3988831043243408s. +[triton-dejavu] First execution including JIT compilation took 0.9115607738494873s. +[triton-dejavu] First execution including JIT compilation took 0.5902762413024902s. +[triton-dejavu] First execution including JIT compilation took 0.34618401527404785s. +[triton-dejavu] First execution including JIT compilation took 0.9532392024993896s. 
+[triton-dejavu] First execution including JIT compilation took 0.635444164276123s. +[triton-dejavu] First execution including JIT compilation took 0.3781321048736572s. +[triton-dejavu] First execution including JIT compilation took 1.0092928409576416s. +[triton-dejavu] First execution including JIT compilation took 0.6709246635437012s. +[triton-dejavu] First execution including JIT compilation took 0.38914012908935547s. +[triton-dejavu] First execution including JIT compilation took 1.0928781032562256s. +[triton-dejavu] First execution including JIT compilation took 0.003269672393798828s. +[triton-dejavu] First execution including JIT compilation took 0.3971376419067383s. +[triton-dejavu] First execution including JIT compilation took 1.0150482654571533s. +[triton-dejavu] First execution including JIT compilation took 0.6963634490966797s. +[triton-dejavu] First execution including JIT compilation took 0.40409111976623535s. +[triton-dejavu] First execution including JIT compilation took 1.1118721961975098s. +[triton-dejavu] First execution including JIT compilation took 0.6946852207183838s. +[triton-dejavu] First execution including JIT compilation took 0.4175405502319336s. +[triton-dejavu] First execution including JIT compilation took 1.177678108215332s. +[triton-dejavu] First execution including JIT compilation took 0.7395210266113281s. +[triton-dejavu] First execution including JIT compilation took 0.4114506244659424s. +[triton-dejavu] First execution including JIT compilation took 1.3139593601226807s. +[triton-dejavu] First execution including JIT compilation took 0.7956829071044922s. +[triton-dejavu] First execution including JIT compilation took 0.39879727363586426s. +[triton-dejavu] First execution including JIT compilation took 1.328862190246582s. +[triton-dejavu] First execution including JIT compilation took 0.6896190643310547s. +[triton-dejavu] First execution including JIT compilation took 0.33518552780151367s. +[triton-dejavu] First execution including JIT compilation took 1.5963466167449951s. +[triton-dejavu] First execution including JIT compilation took 0.7468023300170898s. +[triton-dejavu] First execution including JIT compilation took 0.4028303623199463s. +[triton-dejavu] First execution including JIT compilation took 1.7192442417144775s. +[triton-dejavu] First execution including JIT compilation took 0.7985246181488037s. +[triton-dejavu] First execution including JIT compilation took 0.37429141998291016s. +[triton-dejavu] First execution including JIT compilation took 1.7711453437805176s. +[triton-dejavu] First execution including JIT compilation took 0.9907217025756836s. +[triton-dejavu] First execution including JIT compilation took 0.49533724784851074s. +[triton-dejavu] First execution including JIT compilation took 2.3483541011810303s. +bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 2.211524486541748s. +[triton-dejavu] First execution including JIT compilation took 1.2513744831085205s. +[triton-dejavu] First execution including JIT compilation took 0.4271657466888428s. +[triton-dejavu] First execution including JIT compilation took 2.204448938369751s. +[triton-dejavu] First execution including JIT compilation took 1.185486078262329s. +[triton-dejavu] First execution including JIT compilation took 0.5369844436645508s. +[triton-dejavu] First execution including JIT compilation took 6.1959922313690186s. +bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. 
Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+[triton-dejavu] First execution including JIT compilation took 5.166867017745972s.
+[triton-dejavu] First execution including JIT compilation took 3.8762850761413574s.
+[triton-dejavu] First execution including JIT compilation took 0.7830004692077637s.
+[triton-dejavu] First execution including JIT compilation took 6.120173931121826s.
+bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1146880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 0.33310794830322266s.
+[triton-dejavu] First execution including JIT compilation took 0.23391294479370117s.
+[triton-dejavu] First execution including JIT compilation took 0.2084214687347412s.
+[triton-dejavu] First execution including JIT compilation took 0.3005564212799072s.
+[triton-dejavu] First execution including JIT compilation took 0.2554941177368164s.
+[triton-dejavu] First execution including JIT compilation took 0.21907782554626465s.
+[triton-dejavu] First execution including JIT compilation took 0.31569480895996094s.
+[triton-dejavu] First execution including JIT compilation took 0.2694690227508545s.
+[triton-dejavu] First execution including JIT compilation took 0.22438645362854004s.
+[triton-dejavu] First execution including JIT compilation took 0.3229238986968994s.
+[triton-dejavu] First execution including JIT compilation took 0.2797393798828125s.
+[triton-dejavu] First execution including JIT compilation took 0.23162508010864258s.
+[triton-dejavu] First execution including JIT compilation took 0.3365445137023926s.
+[triton-dejavu] First execution including JIT compilation took 0.2754044532775879s.
+[triton-dejavu] First execution including JIT compilation took 0.22548437118530273s.
+[triton-dejavu] First execution including JIT compilation took 0.3373396396636963s.
+[triton-dejavu] First execution including JIT compilation took 0.2857697010040283s.
+[triton-dejavu] First execution including JIT compilation took 0.2294597625732422s.
+[triton-dejavu] First execution including JIT compilation took 0.3634176254272461s.
+[triton-dejavu] First execution including JIT compilation took 0.294708251953125s.
+[triton-dejavu] First execution including JIT compilation took 0.24236321449279785s.
+[triton-dejavu] First execution including JIT compilation took 0.3028702735900879s.
+[triton-dejavu] First execution including JIT compilation took 0.2470991611480713s.
+[triton-dejavu] First execution including JIT compilation took 0.21360516548156738s.
+[triton-dejavu] First execution including JIT compilation took 0.3189256191253662s.
+[triton-dejavu] First execution including JIT compilation took 0.25740885734558105s.
+[triton-dejavu] First execution including JIT compilation took 0.23542547225952148s.
+[triton-dejavu] First execution including JIT compilation took 0.34380078315734863s.
+[triton-dejavu] First execution including JIT compilation took 0.2774670124053955s.
+[triton-dejavu] First execution including JIT compilation took 0.24950265884399414s.
+[triton-dejavu] First execution including JIT compilation took 0.4161198139190674s.
+[triton-dejavu] First execution including JIT compilation took 0.28986072540283203s.
+[triton-dejavu] First execution including JIT compilation took 0.2589759826660156s.
+[triton-dejavu] First execution including JIT compilation took 0.41210365295410156s.
+[triton-dejavu] First execution including JIT compilation took 0.32729268074035645s.
+[triton-dejavu] First execution including JIT compilation took 0.25850629806518555s.
+[triton-dejavu] First execution including JIT compilation took 0.4299044609069824s.
+[triton-dejavu] First execution including JIT compilation took 0.3116121292114258s.
+[triton-dejavu] First execution including JIT compilation took 0.27123379707336426s.
+[triton-dejavu] First execution including JIT compilation took 0.45281362533569336s.
+[triton-dejavu] First execution including JIT compilation took 0.3351759910583496s.
+[triton-dejavu] First execution including JIT compilation took 0.2787160873413086s.
+[triton-dejavu] First execution including JIT compilation took 0.41561436653137207s.
+[triton-dejavu] First execution including JIT compilation took 0.27190589904785156s.
+[triton-dejavu] First execution including JIT compilation took 0.2324838638305664s.
+[triton-dejavu] First execution including JIT compilation took 0.4087650775909424s.
+[triton-dejavu] First execution including JIT compilation took 0.28690099716186523s.
+[triton-dejavu] First execution including JIT compilation took 0.24116730690002441s. +[triton-dejavu] First execution including JIT compilation took 0.5066123008728027s. +[triton-dejavu] First execution including JIT compilation took 0.3034372329711914s. +[triton-dejavu] First execution including JIT compilation took 0.25580596923828125s. +[triton-dejavu] First execution including JIT compilation took 0.525223970413208s. +[triton-dejavu] First execution including JIT compilation took 0.33296680450439453s. +[triton-dejavu] First execution including JIT compilation took 0.27128124237060547s. +[triton-dejavu] First execution including JIT compilation took 0.5657172203063965s. +[triton-dejavu] First execution including JIT compilation took 0.3399391174316406s. +[triton-dejavu] First execution including JIT compilation took 0.28380680084228516s. +[triton-dejavu] First execution including JIT compilation took 0.6111602783203125s. +[triton-dejavu] First execution including JIT compilation took 0.36371636390686035s. +[triton-dejavu] First execution including JIT compilation took 0.3011593818664551s. +[triton-dejavu] First execution including JIT compilation took 0.7230055332183838s. +[triton-dejavu] First execution including JIT compilation took 0.4232914447784424s. +[triton-dejavu] First execution including JIT compilation took 0.31528306007385254s. +[triton-dejavu] First execution including JIT compilation took 0.6461219787597656s. +[triton-dejavu] First execution including JIT compilation took 0.36070823669433594s. +[triton-dejavu] First execution including JIT compilation took 0.2686340808868408s. +[triton-dejavu] First execution including JIT compilation took 0.6663899421691895s. +[triton-dejavu] First execution including JIT compilation took 0.3726685047149658s. +[triton-dejavu] First execution including JIT compilation took 0.2806117534637451s. +[triton-dejavu] First execution including JIT compilation took 1.2110939025878906s. +[triton-dejavu] First execution including JIT compilation took 0.43669724464416504s. +[triton-dejavu] First execution including JIT compilation took 0.29979729652404785s. +[triton-dejavu] First execution including JIT compilation took 1.2734310626983643s. +[triton-dejavu] First execution including JIT compilation took 0.4524815082550049s. +[triton-dejavu] First execution including JIT compilation took 0.30893588066101074s. +[triton-dejavu] First execution including JIT compilation took 1.3412039279937744s. +[triton-dejavu] First execution including JIT compilation took 0.4808011054992676s. +[triton-dejavu] First execution including JIT compilation took 0.32793712615966797s. +[triton-dejavu] First execution including JIT compilation took 1.3745002746582031s. +[triton-dejavu] First execution including JIT compilation took 0.4310414791107178s. +[triton-dejavu] First execution including JIT compilation took 0.2714354991912842s. +bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 1.2250773906707764s.
+[triton-dejavu] First execution including JIT compilation took 0.4646158218383789s.
+[triton-dejavu] First execution including JIT compilation took 0.2895984649658203s.
+[triton-dejavu] First execution including JIT compilation took 1.310636043548584s.
+[triton-dejavu] First execution including JIT compilation took 0.5870482921600342s.
+[triton-dejavu] First execution including JIT compilation took 0.36336755752563477s.
+[triton-dejavu] First execution including JIT compilation took 5.4522223472595215s.
+[triton-dejavu] First execution including JIT compilation took 0.9788007736206055s.
+[triton-dejavu] First execution including JIT compilation took 0.3662402629852295s.
+[triton-dejavu] First execution including JIT compilation took 5.491236209869385s.
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 0.3520832061767578s.
+[triton-dejavu] First execution including JIT compilation took 0.3299834728240967s.
+[triton-dejavu] First execution including JIT compilation took 0.21879220008850098s.
+[triton-dejavu] First execution including JIT compilation took 0.3383035659790039s.
+[triton-dejavu] First execution including JIT compilation took 0.2801651954650879s.
+[triton-dejavu] First execution including JIT compilation took 0.23688268661499023s.
+[triton-dejavu] First execution including JIT compilation took 0.3647580146789551s.
+[triton-dejavu] First execution including JIT compilation took 0.2947418689727783s.
+[triton-dejavu] First execution including JIT compilation took 0.27162957191467285s.
+[triton-dejavu] First execution including JIT compilation took 0.38416576385498047s.
+[triton-dejavu] First execution including JIT compilation took 0.30350494384765625s.
+[triton-dejavu] First execution including JIT compilation took 0.2574441432952881s.
+[triton-dejavu] First execution including JIT compilation took 0.4012939929962158s.
+[triton-dejavu] First execution including JIT compilation took 0.31052398681640625s.
+[triton-dejavu] First execution including JIT compilation took 0.26515769958496094s.
+[triton-dejavu] First execution including JIT compilation took 0.39237308502197266s.
+[triton-dejavu] First execution including JIT compilation took 0.3096015453338623s.
+[triton-dejavu] First execution including JIT compilation took 0.262850284576416s.
+[triton-dejavu] First execution including JIT compilation took 0.4091818332672119s.
+[triton-dejavu] First execution including JIT compilation took 0.321216344833374s.
+[triton-dejavu] First execution including JIT compilation took 0.2732977867126465s.
+[triton-dejavu] First execution including JIT compilation took 0.4301795959472656s.
+[triton-dejavu] First execution including JIT compilation took 0.2619006633758545s.
+[triton-dejavu] First execution including JIT compilation took 0.22098731994628906s.
+[triton-dejavu] First execution including JIT compilation took 0.37630200386047363s.
+[triton-dejavu] First execution including JIT compilation took 0.34072089195251465s. +[triton-dejavu] First execution including JIT compilation took 0.23756647109985352s. +[triton-dejavu] First execution including JIT compilation took 0.41828155517578125s. +[triton-dejavu] First execution including JIT compilation took 0.30147528648376465s. +[triton-dejavu] First execution including JIT compilation took 0.2543652057647705s. +[triton-dejavu] First execution including JIT compilation took 0.43347787857055664s. +[triton-dejavu] First execution including JIT compilation took 0.0028808116912841797s. +[triton-dejavu] First execution including JIT compilation took 0.2641146183013916s. +[triton-dejavu] First execution including JIT compilation took 0.530811071395874s. +[triton-dejavu] First execution including JIT compilation took 0.3217940330505371s. +[triton-dejavu] First execution including JIT compilation took 0.28223276138305664s. +[triton-dejavu] First execution including JIT compilation took 0.47287917137145996s. +[triton-dejavu] First execution including JIT compilation took 0.3476870059967041s. +[triton-dejavu] First execution including JIT compilation took 0.28547072410583496s. +[triton-dejavu] First execution including JIT compilation took 0.524724006652832s. +[triton-dejavu] First execution including JIT compilation took 0.36275696754455566s. +[triton-dejavu] First execution including JIT compilation took 0.2947351932525635s. +[triton-dejavu] First execution including JIT compilation took 0.4834451675415039s. +[triton-dejavu] First execution including JIT compilation took 0.31950998306274414s. +[triton-dejavu] First execution including JIT compilation took 0.2404794692993164s. +[triton-dejavu] First execution including JIT compilation took 0.5020296573638916s. +[triton-dejavu] First execution including JIT compilation took 0.32535886764526367s. +[triton-dejavu] First execution including JIT compilation took 0.2655165195465088s. +[triton-dejavu] First execution including JIT compilation took 0.5749058723449707s. +[triton-dejavu] First execution including JIT compilation took 0.35364580154418945s. +[triton-dejavu] First execution including JIT compilation took 0.26879262924194336s. +[triton-dejavu] First execution including JIT compilation took 0.5956721305847168s. +[triton-dejavu] First execution including JIT compilation took 0.3535337448120117s. +[triton-dejavu] First execution including JIT compilation took 0.28673887252807617s. +[triton-dejavu] First execution including JIT compilation took 0.6367616653442383s. +[triton-dejavu] First execution including JIT compilation took 0.3775303363800049s. +[triton-dejavu] First execution including JIT compilation took 0.3009200096130371s. +[triton-dejavu] First execution including JIT compilation took 0.6813859939575195s. +[triton-dejavu] First execution including JIT compilation took 0.4030306339263916s. +[triton-dejavu] First execution including JIT compilation took 0.29657673835754395s. +[triton-dejavu] First execution including JIT compilation took 0.7835328578948975s. +[triton-dejavu] First execution including JIT compilation took 0.4300117492675781s. +[triton-dejavu] First execution including JIT compilation took 0.3349874019622803s. +[triton-dejavu] First execution including JIT compilation took 0.7587988376617432s. +[triton-dejavu] First execution including JIT compilation took 0.4517331123352051s. +[triton-dejavu] First execution including JIT compilation took 0.28656530380249023s. 
+[triton-dejavu] First execution including JIT compilation took 0.7893960475921631s.
+[triton-dejavu] First execution including JIT compilation took 0.4339447021484375s.
+[triton-dejavu] First execution including JIT compilation took 0.3180868625640869s.
+[triton-dejavu] First execution including JIT compilation took 1.3489327430725098s.
+[triton-dejavu] First execution including JIT compilation took 0.5434455871582031s.
+[triton-dejavu] First execution including JIT compilation took 0.35109591484069824s.
+[triton-dejavu] First execution including JIT compilation took 1.4147284030914307s.
+[triton-dejavu] First execution including JIT compilation took 0.5231330394744873s.
+[triton-dejavu] First execution including JIT compilation took 0.35427212715148926s.
+[triton-dejavu] First execution including JIT compilation took 1.4746348857879639s.
+[triton-dejavu] First execution including JIT compilation took 0.561915397644043s.
+[triton-dejavu] First execution including JIT compilation took 0.3480250835418701s.
+[triton-dejavu] First execution including JIT compilation took 1.5745019912719727s.
+bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 1.5983943939208984s.
+[triton-dejavu] First execution including JIT compilation took 0.6874723434448242s.
+[triton-dejavu] First execution including JIT compilation took 0.3813052177429199s.
+[triton-dejavu] First execution including JIT compilation took 1.7068426609039307s.
+[triton-dejavu] First execution including JIT compilation took 0.6691153049468994s.
+[triton-dejavu] First execution including JIT compilation took 0.39803266525268555s.
+[triton-dejavu] First execution including JIT compilation took 5.79765248298645s.
+bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 0.3679988384246826s.
+[triton-dejavu] First execution including JIT compilation took 0.2748281955718994s.
+[triton-dejavu] First execution including JIT compilation took 0.2133641242980957s.
+[triton-dejavu] First execution including JIT compilation took 0.42013049125671387s.
+[triton-dejavu] First execution including JIT compilation took 0.30323004722595215s.
+[triton-dejavu] First execution including JIT compilation took 0.2673945426940918s.
+[triton-dejavu] First execution including JIT compilation took 0.49060797691345215s. +[triton-dejavu] First execution including JIT compilation took 0.37160611152648926s. +[triton-dejavu] First execution including JIT compilation took 0.2765369415283203s. +[triton-dejavu] First execution including JIT compilation took 0.5039165019989014s. +[triton-dejavu] First execution including JIT compilation took 0.37934136390686035s. +[triton-dejavu] First execution including JIT compilation took 0.2831428050994873s. +[triton-dejavu] First execution including JIT compilation took 0.5394682884216309s. +[triton-dejavu] First execution including JIT compilation took 0.37555360794067383s. +[triton-dejavu] First execution including JIT compilation took 0.2974073886871338s. +[triton-dejavu] First execution including JIT compilation took 0.5276486873626709s. +[triton-dejavu] First execution including JIT compilation took 0.39134836196899414s. +[triton-dejavu] First execution including JIT compilation took 0.2950737476348877s. +[triton-dejavu] First execution including JIT compilation took 0.5684738159179688s. +[triton-dejavu] First execution including JIT compilation took 0.41124916076660156s. +[triton-dejavu] First execution including JIT compilation took 0.3004477024078369s. +[triton-dejavu] First execution including JIT compilation took 0.5164830684661865s. +[triton-dejavu] First execution including JIT compilation took 0.33581042289733887s. +[triton-dejavu] First execution including JIT compilation took 0.27167344093322754s. +[triton-dejavu] First execution including JIT compilation took 0.5106401443481445s. +[triton-dejavu] First execution including JIT compilation took 0.37090396881103516s. +[triton-dejavu] First execution including JIT compilation took 0.2658994197845459s. +[triton-dejavu] First execution including JIT compilation took 0.5844974517822266s. +[triton-dejavu] First execution including JIT compilation took 0.3731074333190918s. +[triton-dejavu] First execution including JIT compilation took 0.31909990310668945s. +[triton-dejavu] First execution including JIT compilation took 0.5862879753112793s. +[triton-dejavu] First execution including JIT compilation took 0.0029494762420654297s. +[triton-dejavu] First execution including JIT compilation took 0.290740966796875s. +[triton-dejavu] First execution including JIT compilation took 0.6013433933258057s. +[triton-dejavu] First execution including JIT compilation took 0.4201853275299072s. +[triton-dejavu] First execution including JIT compilation took 0.30014801025390625s. +[triton-dejavu] First execution including JIT compilation took 0.6341700553894043s. +[triton-dejavu] First execution including JIT compilation took 0.4125685691833496s. +[triton-dejavu] First execution including JIT compilation took 0.3149580955505371s. +[triton-dejavu] First execution including JIT compilation took 0.7038888931274414s. +[triton-dejavu] First execution including JIT compilation took 0.44381022453308105s. +[triton-dejavu] First execution including JIT compilation took 0.3345675468444824s. +[triton-dejavu] First execution including JIT compilation took 0.7102112770080566s. +[triton-dejavu] First execution including JIT compilation took 0.39132046699523926s. +[triton-dejavu] First execution including JIT compilation took 0.2966330051422119s. +[triton-dejavu] First execution including JIT compilation took 0.7261581420898438s. +[triton-dejavu] First execution including JIT compilation took 0.42345237731933594s. 
+[triton-dejavu] First execution including JIT compilation took 0.31378960609436035s. +[triton-dejavu] First execution including JIT compilation took 0.7939469814300537s. +[triton-dejavu] First execution including JIT compilation took 0.45282721519470215s. +[triton-dejavu] First execution including JIT compilation took 0.3177626132965088s. +[triton-dejavu] First execution including JIT compilation took 0.8336560726165771s. +[triton-dejavu] First execution including JIT compilation took 0.35431385040283203s. +[triton-dejavu] First execution including JIT compilation took 0.32625389099121094s. +[triton-dejavu] First execution including JIT compilation took 0.768075704574585s. +[triton-dejavu] First execution including JIT compilation took 0.3967933654785156s. +[triton-dejavu] First execution including JIT compilation took 0.27690625190734863s. +[triton-dejavu] First execution including JIT compilation took 0.9250342845916748s. +[triton-dejavu] First execution including JIT compilation took 0.49423885345458984s. +[triton-dejavu] First execution including JIT compilation took 0.34920620918273926s. +[triton-dejavu] First execution including JIT compilation took 1.0775840282440186s. +[triton-dejavu] First execution including JIT compilation took 0.5416042804718018s. +[triton-dejavu] First execution including JIT compilation took 0.38259077072143555s. +[triton-dejavu] First execution including JIT compilation took 1.1039273738861084s. +[triton-dejavu] First execution including JIT compilation took 0.526303768157959s. +[triton-dejavu] First execution including JIT compilation took 0.34534621238708496s. +[triton-dejavu] First execution including JIT compilation took 1.1143405437469482s. +[triton-dejavu] First execution including JIT compilation took 0.5508031845092773s. +[triton-dejavu] First execution including JIT compilation took 0.37677478790283203s. +[triton-dejavu] First execution including JIT compilation took 1.8315963745117188s. +[triton-dejavu] First execution including JIT compilation took 0.6505274772644043s. +[triton-dejavu] First execution including JIT compilation took 0.39488959312438965s. +[triton-dejavu] First execution including JIT compilation took 1.9625489711761475s. +[triton-dejavu] First execution including JIT compilation took 0.6776554584503174s. +[triton-dejavu] First execution including JIT compilation took 0.41101694107055664s. +[triton-dejavu] First execution including JIT compilation took 2.0118651390075684s. +bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 2.172940969467163s. +[triton-dejavu] First execution including JIT compilation took 0.9175102710723877s. +[triton-dejavu] First execution including JIT compilation took 0.42366957664489746s. +[triton-dejavu] First execution including JIT compilation took 2.0173258781433105s. +[triton-dejavu] First execution including JIT compilation took 0.7885754108428955s. +[triton-dejavu] First execution including JIT compilation took 0.44706130027770996s. 
+[triton-dejavu] First execution including JIT compilation took 7.324063301086426s. +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 688128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 688128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.7107067108154297s. +[triton-dejavu] First execution including JIT compilation took 0.40702342987060547s. +[triton-dejavu] First execution including JIT compilation took 0.271730899810791s. +[triton-dejavu] First execution including JIT compilation took 0.7312145233154297s. +[triton-dejavu] First execution including JIT compilation took 0.4074242115020752s. +[triton-dejavu] First execution including JIT compilation took 0.2868027687072754s. 
+[triton-dejavu] First execution including JIT compilation took 0.7007474899291992s. +[triton-dejavu] First execution including JIT compilation took 0.42080259323120117s. +[triton-dejavu] First execution including JIT compilation took 0.27320408821105957s. +[triton-dejavu] First execution including JIT compilation took 0.722841739654541s. +[triton-dejavu] First execution including JIT compilation took 0.5550060272216797s. +[triton-dejavu] First execution including JIT compilation took 0.32157206535339355s. +[triton-dejavu] First execution including JIT compilation took 0.8072361946105957s. +[triton-dejavu] First execution including JIT compilation took 0.43352651596069336s. +[triton-dejavu] First execution including JIT compilation took 0.2982165813446045s. +[triton-dejavu] First execution including JIT compilation took 0.7527244091033936s. +[triton-dejavu] First execution including JIT compilation took 0.4649670124053955s. +[triton-dejavu] First execution including JIT compilation took 0.3391098976135254s. +[triton-dejavu] First execution including JIT compilation took 0.936931848526001s. +[triton-dejavu] First execution including JIT compilation took 0.46184659004211426s. +[triton-dejavu] First execution including JIT compilation took 0.2983987331390381s. +[triton-dejavu] First execution including JIT compilation took 0.7631199359893799s. +[triton-dejavu] First execution including JIT compilation took 0.39908528327941895s. +[triton-dejavu] First execution including JIT compilation took 0.32989048957824707s. +[triton-dejavu] First execution including JIT compilation took 0.7596316337585449s. +[triton-dejavu] First execution including JIT compilation took 0.43782997131347656s. +[triton-dejavu] First execution including JIT compilation took 0.3047447204589844s. +[triton-dejavu] First execution including JIT compilation took 0.8982362747192383s. +[triton-dejavu] First execution including JIT compilation took 0.4925217628479004s. +[triton-dejavu] First execution including JIT compilation took 0.3316771984100342s. +[triton-dejavu] First execution including JIT compilation took 0.864621639251709s. +[triton-dejavu] First execution including JIT compilation took 0.016417741775512695s. +[triton-dejavu] First execution including JIT compilation took 0.3927609920501709s. +[triton-dejavu] First execution including JIT compilation took 0.8940439224243164s. +[triton-dejavu] First execution including JIT compilation took 0.4808948040008545s. +[triton-dejavu] First execution including JIT compilation took 0.3320889472961426s. +[triton-dejavu] First execution including JIT compilation took 0.9511239528656006s. +[triton-dejavu] First execution including JIT compilation took 0.510263204574585s. +[triton-dejavu] First execution including JIT compilation took 0.3106980323791504s. +[triton-dejavu] First execution including JIT compilation took 0.9828391075134277s. +[triton-dejavu] First execution including JIT compilation took 0.6096630096435547s. +[triton-dejavu] First execution including JIT compilation took 0.34572386741638184s. +[triton-dejavu] First execution including JIT compilation took 1.0500340461730957s. +[triton-dejavu] First execution including JIT compilation took 0.4872100353240967s. +[triton-dejavu] First execution including JIT compilation took 0.3133056163787842s. +[triton-dejavu] First execution including JIT compilation took 1.0500223636627197s. +[triton-dejavu] First execution including JIT compilation took 0.5330610275268555s. 
+[triton-dejavu] First execution including JIT compilation took 0.3301053047180176s. +[triton-dejavu] First execution including JIT compilation took 1.1683359146118164s. +[triton-dejavu] First execution including JIT compilation took 0.5536503791809082s. +[triton-dejavu] First execution including JIT compilation took 0.34630656242370605s. +[triton-dejavu] First execution including JIT compilation took 1.6548552513122559s. +[triton-dejavu] First execution including JIT compilation took 0.7423355579376221s. +[triton-dejavu] First execution including JIT compilation took 0.4386255741119385s. +[triton-dejavu] First execution including JIT compilation took 1.7830908298492432s. +[triton-dejavu] First execution including JIT compilation took 0.7749922275543213s. +[triton-dejavu] First execution including JIT compilation took 0.4658083915710449s. +[triton-dejavu] First execution including JIT compilation took 1.905794382095337s. +[triton-dejavu] First execution including JIT compilation took 0.816021203994751s. +[triton-dejavu] First execution including JIT compilation took 0.4723987579345703s. +[triton-dejavu] First execution including JIT compilation took 1.869170904159546s. +bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 1.789454460144043s. +[triton-dejavu] First execution including JIT compilation took 0.6909325122833252s. +[triton-dejavu] First execution including JIT compilation took 0.4053471088409424s. +[triton-dejavu] First execution including JIT compilation took 1.7492396831512451s. +[triton-dejavu] First execution including JIT compilation took 0.7165470123291016s. +[triton-dejavu] First execution including JIT compilation took 0.4185338020324707s. +[triton-dejavu] First execution including JIT compilation took 2.8342366218566895s. +[triton-dejavu] First execution including JIT compilation took 0.8270382881164551s. +[triton-dejavu] First execution including JIT compilation took 0.4604911804199219s. +[triton-dejavu] First execution including JIT compilation took 2.927734851837158s. +bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 458752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 458752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 4.0586724281311035s. +[triton-dejavu] First execution including JIT compilation took 1.5080604553222656s. +[triton-dejavu] First execution including JIT compilation took 0.6953163146972656s. +[triton-dejavu] First execution including JIT compilation took 4.226686000823975s. +bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 917504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 917504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 1.3607323169708252s. +[triton-dejavu] First execution including JIT compilation took 0.7990188598632812s. +[triton-dejavu] First execution including JIT compilation took 0.39726877212524414s. +[triton-dejavu] First execution including JIT compilation took 1.393247127532959s. +[triton-dejavu] First execution including JIT compilation took 0.9832372665405273s. +[triton-dejavu] First execution including JIT compilation took 0.4763679504394531s. 
+[triton-dejavu] First execution including JIT compilation took 1.456979513168335s. +[triton-dejavu] First execution including JIT compilation took 1.0040147304534912s. +[triton-dejavu] First execution including JIT compilation took 0.4499683380126953s. +[triton-dejavu] First execution including JIT compilation took 1.467405080795288s. +[triton-dejavu] First execution including JIT compilation took 1.0723049640655518s. +[triton-dejavu] First execution including JIT compilation took 0.49906277656555176s. +[triton-dejavu] First execution including JIT compilation took 1.524533987045288s. +[triton-dejavu] First execution including JIT compilation took 1.4248688220977783s. +[triton-dejavu] First execution including JIT compilation took 0.6042609214782715s. +[triton-dejavu] First execution including JIT compilation took 1.7416322231292725s. +[triton-dejavu] First execution including JIT compilation took 1.0214593410491943s. +[triton-dejavu] First execution including JIT compilation took 0.45897865295410156s. +[triton-dejavu] First execution including JIT compilation took 1.5276007652282715s. +[triton-dejavu] First execution including JIT compilation took 1.0185387134552002s. +[triton-dejavu] First execution including JIT compilation took 0.5293161869049072s. +[triton-dejavu] First execution including JIT compilation took 1.8517167568206787s. +[triton-dejavu] First execution including JIT compilation took 0.9630119800567627s. +[triton-dejavu] First execution including JIT compilation took 0.43575310707092285s. +[triton-dejavu] First execution including JIT compilation took 1.9177396297454834s. +[triton-dejavu] First execution including JIT compilation took 1.569082498550415s. +[triton-dejavu] First execution including JIT compilation took 0.622168779373169s. +[triton-dejavu] First execution including JIT compilation took 2.339301347732544s. +[triton-dejavu] First execution including JIT compilation took 1.5994513034820557s. +[triton-dejavu] First execution including JIT compilation took 0.6422829627990723s. +[triton-dejavu] First execution including JIT compilation took 2.1358773708343506s. +[triton-dejavu] First execution including JIT compilation took 1.1553890705108643s. +[triton-dejavu] First execution including JIT compilation took 0.5729074478149414s. +[triton-dejavu] First execution including JIT compilation took 1.8737192153930664s. +[triton-dejavu] First execution including JIT compilation took 1.6270005702972412s. +[triton-dejavu] First execution including JIT compilation took 0.5927095413208008s. +[triton-dejavu] First execution including JIT compilation took 1.9137556552886963s. +[triton-dejavu] First execution including JIT compilation took 1.6627833843231201s. +[triton-dejavu] First execution including JIT compilation took 0.6282734870910645s. +[triton-dejavu] First execution including JIT compilation took 2.6357598304748535s. +[triton-dejavu] First execution including JIT compilation took 1.3591229915618896s. +[triton-dejavu] First execution including JIT compilation took 0.6953067779541016s. +[triton-dejavu] First execution including JIT compilation took 2.43611741065979s. +[triton-dejavu] First execution including JIT compilation took 1.2323598861694336s. +[triton-dejavu] First execution including JIT compilation took 0.6111257076263428s. +[triton-dejavu] First execution including JIT compilation took 2.841799259185791s. +[triton-dejavu] First execution including JIT compilation took 1.360656976699829s. 
+[triton-dejavu] First execution including JIT compilation took 0.8137938976287842s. +[triton-dejavu] First execution including JIT compilation took 3.458110809326172s. +[triton-dejavu] First execution including JIT compilation took 1.5271718502044678s. +[triton-dejavu] First execution including JIT compilation took 0.0032939910888671875s. +[triton-dejavu] First execution including JIT compilation took 2.9182276725769043s. +[triton-dejavu] First execution including JIT compilation took 1.539180040359497s. +[triton-dejavu] First execution including JIT compilation took 0.6763615608215332s. +[triton-dejavu] First execution including JIT compilation took 3.089775562286377s. +bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 4.155709505081177s. +[triton-dejavu] First execution including JIT compilation took 2.4426767826080322s. +[triton-dejavu] First execution including JIT compilation took 1.0379819869995117s. +[triton-dejavu] First execution including JIT compilation took 4.222529649734497s. +[triton-dejavu] First execution including JIT compilation took 2.4925472736358643s. +[triton-dejavu] First execution including JIT compilation took 1.073103666305542s. +[triton-dejavu] First execution including JIT compilation took 8.762295961380005s. +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. 
Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 688128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__
+    self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
+RuntimeError: Triton Error [CUDA]: out of memory
+
+[triton-dejavu] First execution including JIT compilation took 7.50976037979126s.
+[triton-dejavu] First execution including JIT compilation took 3.327193260192871s.
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1179648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1179648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1376256, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1572864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1572864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 0.3581666946411133s.
+[triton-dejavu] First execution including JIT compilation took 0.2740662097930908s.
+[triton-dejavu] First execution including JIT compilation took 0.23471784591674805s.
+[triton-dejavu] First execution including JIT compilation took 0.3980753421783447s.
+[triton-dejavu] First execution including JIT compilation took 0.28090405464172363s.
+[triton-dejavu] First execution including JIT compilation took 0.21105456352233887s.
+[triton-dejavu] First execution including JIT compilation took 0.41014552116394043s.
+[triton-dejavu] First execution including JIT compilation took 0.3041348457336426s.
+[triton-dejavu] First execution including JIT compilation took 0.2524149417877197s.
+[triton-dejavu] First execution including JIT compilation took 0.42508769035339355s.
+[triton-dejavu] First execution including JIT compilation took 0.3460569381713867s.
+[triton-dejavu] First execution including JIT compilation took 0.26442742347717285s.
+[triton-dejavu] First execution including JIT compilation took 0.46298742294311523s.
+[triton-dejavu] First execution including JIT compilation took 0.33917737007141113s.
+[triton-dejavu] First execution including JIT compilation took 0.2681269645690918s.
+[triton-dejavu] First execution including JIT compilation took 0.48372411727905273s.
+[triton-dejavu] First execution including JIT compilation took 0.34528517723083496s.
+[triton-dejavu] First execution including JIT compilation took 0.27705836296081543s.
+[triton-dejavu] First execution including JIT compilation took 0.5136411190032959s.
+[triton-dejavu] First execution including JIT compilation took 0.35900115966796875s.
+[triton-dejavu] First execution including JIT compilation took 0.27854084968566895s.
+[triton-dejavu] First execution including JIT compilation took 0.4344968795776367s.
+[triton-dejavu] First execution including JIT compilation took 0.29988908767700195s.
+[triton-dejavu] First execution including JIT compilation took 0.4305758476257324s.
+[triton-dejavu] First execution including JIT compilation took 0.4533987045288086s.
+[triton-dejavu] First execution including JIT compilation took 0.535660982131958s.
+[triton-dejavu] First execution including JIT compilation took 0.2640557289123535s.
+[triton-dejavu] First execution including JIT compilation took 0.7827637195587158s.
+[triton-dejavu] First execution including JIT compilation took 0.4749734401702881s.
+[triton-dejavu] First execution including JIT compilation took 0.28125476837158203s.
+[triton-dejavu] First execution including JIT compilation took 0.5667471885681152s.
+[triton-dejavu] First execution including JIT compilation took 0.3628854751586914s.
+[triton-dejavu] First execution including JIT compilation took 0.27803826332092285s.
+[triton-dejavu] First execution including JIT compilation took 0.5935788154602051s.
+[triton-dejavu] First execution including JIT compilation took 0.37410998344421387s.
+[triton-dejavu] First execution including JIT compilation took 0.315047025680542s.
+[triton-dejavu] First execution including JIT compilation took 0.6383876800537109s.
+[triton-dejavu] First execution including JIT compilation took 0.39231395721435547s.
+[triton-dejavu] First execution including JIT compilation took 0.4904477596282959s.
+[triton-dejavu] First execution including JIT compilation took 0.7176785469055176s.
+[triton-dejavu] First execution including JIT compilation took 0.8923492431640625s.
+[triton-dejavu] First execution including JIT compilation took 0.37270665168762207s.
+[triton-dejavu] First execution including JIT compilation took 0.7702887058258057s.
+[triton-dejavu] First execution including JIT compilation took 0.39134764671325684s.
+[triton-dejavu] First execution including JIT compilation took 0.27783751487731934s.
+[triton-dejavu] First execution including JIT compilation took 0.7178552150726318s.
+[triton-dejavu] First execution including JIT compilation took 0.5033924579620361s.
+[triton-dejavu] First execution including JIT compilation took 0.29184746742248535s.
+[triton-dejavu] First execution including JIT compilation took 1.4417552947998047s.
+[triton-dejavu] First execution including JIT compilation took 0.455214262008667s.
+[triton-dejavu] First execution including JIT compilation took 0.32988977432250977s.
+[triton-dejavu] First execution including JIT compilation took 1.2478272914886475s.
+[triton-dejavu] First execution including JIT compilation took 0.8535811901092529s.
+[triton-dejavu] First execution including JIT compilation took 0.4271533489227295s.
+[triton-dejavu] First execution including JIT compilation took 1.5763370990753174s.
+[triton-dejavu] First execution including JIT compilation took 0.6108431816101074s.
+[triton-dejavu] First execution including JIT compilation took 0.3532085418701172s.
+[triton-dejavu] First execution including JIT compilation took 1.6405696868896484s.
+[triton-dejavu] First execution including JIT compilation took 0.5536832809448242s.
+[triton-dejavu] First execution including JIT compilation took 0.36753392219543457s.
+bench_cudagraph failed with out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 1.3316895961761475s.
+[triton-dejavu] First execution including JIT compilation took 0.578660249710083s.
+[triton-dejavu] First execution including JIT compilation took 0.3567483425140381s.
+[triton-dejavu] First execution including JIT compilation took 1.5676116943359375s.
+[triton-dejavu] First execution including JIT compilation took 0.5794088840484619s.
+[triton-dejavu] First execution including JIT compilation took 0.3735392093658447s.
+[triton-dejavu] First execution including JIT compilation took 5.502956390380859s.
+[triton-dejavu] First execution including JIT compilation took 1.0270774364471436s.
+[triton-dejavu] First execution including JIT compilation took 0.442889928817749s.
+[triton-dejavu] First execution including JIT compilation took 5.70585036277771s.
+bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 487424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 3.93192195892334s.
+[triton-dejavu] First execution including JIT compilation took 1.0750982761383057s.
+[triton-dejavu] First execution including JIT compilation took 0.5941033363342285s.
+[triton-dejavu] First execution including JIT compilation took 4.812488079071045s.
+bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 696320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 696320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 696320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 835584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 835584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 974848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1114112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1114112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 0.5393879413604736s.
+[triton-dejavu] First execution including JIT compilation took 0.36963605880737305s.
+[triton-dejavu] First execution including JIT compilation took 0.3970627784729004s.
+[triton-dejavu] First execution including JIT compilation took 0.4841430187225342s.
+[triton-dejavu] First execution including JIT compilation took 0.3199918270111084s.
+[triton-dejavu] First execution including JIT compilation took 0.2800755500793457s.
+[triton-dejavu] First execution including JIT compilation took 0.5644237995147705s.
+[triton-dejavu] First execution including JIT compilation took 0.3869204521179199s.
+[triton-dejavu] First execution including JIT compilation took 0.4037020206451416s.
+[triton-dejavu] First execution including JIT compilation took 0.5500500202178955s.
+[triton-dejavu] First execution including JIT compilation took 0.3945121765136719s.
+[triton-dejavu] First execution including JIT compilation took 0.3146946430206299s.
+[triton-dejavu] First execution including JIT compilation took 0.5734715461730957s.
+[triton-dejavu] First execution including JIT compilation took 0.5372509956359863s.
+[triton-dejavu] First execution including JIT compilation took 0.3640165328979492s.
+[triton-dejavu] First execution including JIT compilation took 0.6109771728515625s.
+[triton-dejavu] First execution including JIT compilation took 0.4634361267089844s.
+[triton-dejavu] First execution including JIT compilation took 0.4206717014312744s.
+[triton-dejavu] First execution including JIT compilation took 1.0486819744110107s.
+[triton-dejavu] First execution including JIT compilation took 0.44484424591064453s.
+[triton-dejavu] First execution including JIT compilation took 0.3491060733795166s.
+[triton-dejavu] First execution including JIT compilation took 0.7697179317474365s.
+[triton-dejavu] First execution including JIT compilation took 0.3961319923400879s.
+[triton-dejavu] First execution including JIT compilation took 0.3008708953857422s.
+[triton-dejavu] First execution including JIT compilation took 0.6616361141204834s.
+[triton-dejavu] First execution including JIT compilation took 0.45753026008605957s.
+[triton-dejavu] First execution including JIT compilation took 0.3097813129425049s.
+[triton-dejavu] First execution including JIT compilation took 0.7761518955230713s.
+[triton-dejavu] First execution including JIT compilation took 0.5004098415374756s.
+[triton-dejavu] First execution including JIT compilation took 0.3134744167327881s.
+[triton-dejavu] First execution including JIT compilation took 0.7714171409606934s.
+[triton-dejavu] First execution including JIT compilation took 0.7993361949920654s.
+[triton-dejavu] First execution including JIT compilation took 0.34277820587158203s.
+[triton-dejavu] First execution including JIT compilation took 0.808971643447876s.
+[triton-dejavu] First execution including JIT compilation took 0.4371776580810547s.
+[triton-dejavu] First execution including JIT compilation took 0.31221866607666016s.
+[triton-dejavu] First execution including JIT compilation took 0.6809587478637695s.
+[triton-dejavu] First execution including JIT compilation took 0.40524864196777344s.
+[triton-dejavu] First execution including JIT compilation took 0.49398159980773926s.
+[triton-dejavu] First execution including JIT compilation took 0.7367451190948486s.
+[triton-dejavu] First execution including JIT compilation took 0.7439749240875244s.
+[triton-dejavu] First execution including JIT compilation took 0.3696317672729492s.
+[triton-dejavu] First execution including JIT compilation took 1.1181640625s.
+[triton-dejavu] First execution including JIT compilation took 0.4313173294067383s.
+[triton-dejavu] First execution including JIT compilation took 0.297299861907959s.
+[triton-dejavu] First execution including JIT compilation took 0.8869140148162842s.
+[triton-dejavu] First execution including JIT compilation took 0.48682713508605957s.
+[triton-dejavu] First execution including JIT compilation took 0.3501567840576172s.
+[triton-dejavu] First execution including JIT compilation took 1.4581646919250488s.
+[triton-dejavu] First execution including JIT compilation took 0.5649135112762451s.
+[triton-dejavu] First execution including JIT compilation took 0.3721659183502197s.
+[triton-dejavu] First execution including JIT compilation took 1.5119690895080566s.
+[triton-dejavu] First execution including JIT compilation took 0.5899574756622314s.
+[triton-dejavu] First execution including JIT compilation took 0.3819904327392578s.
+[triton-dejavu] First execution including JIT compilation took 1.6209561824798584s.
+[triton-dejavu] First execution including JIT compilation took 0.6263985633850098s.
+[triton-dejavu] First execution including JIT compilation took 0.38887882232666016s.
+[triton-dejavu] First execution including JIT compilation took 1.7282218933105469s.
+[triton-dejavu] First execution including JIT compilation took 0.6377005577087402s.
+[triton-dejavu] First execution including JIT compilation took 0.4078361988067627s.
+bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 1.6659209728240967s.
+[triton-dejavu] First execution including JIT compilation took 0.9628505706787109s.
+[triton-dejavu] First execution including JIT compilation took 0.4381530284881592s.
+[triton-dejavu] First execution including JIT compilation took 1.6766464710235596s.
+[triton-dejavu] First execution including JIT compilation took 0.7337453365325928s.
+[triton-dejavu] First execution including JIT compilation took 0.673093318939209s.
+[triton-dejavu] First execution including JIT compilation took 7.029362678527832s.
+[triton-dejavu] First execution including JIT compilation took 1.219388484954834s.
+[triton-dejavu] First execution including JIT compilation took 0.8028266429901123s.
+[triton-dejavu] First execution including JIT compilation took 6.798900127410889s.
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 5.058229684829712s. +[triton-dejavu] First execution including JIT compilation took 1.5925123691558838s. +[triton-dejavu] First execution including JIT compilation took 0.6987450122833252s. +[triton-dejavu] First execution including JIT compilation took 5.16088080406189s. +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 737280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 737280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 737280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 737280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 737280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 737280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 884736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 884736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 884736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 884736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1032192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1032192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1179648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1179648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1179648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1179648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.8672266006469727s. +[triton-dejavu] First execution including JIT compilation took 0.4460761547088623s. +[triton-dejavu] First execution including JIT compilation took 0.2941617965698242s. +[triton-dejavu] First execution including JIT compilation took 0.7295150756835938s. +[triton-dejavu] First execution including JIT compilation took 0.418994665145874s. +[triton-dejavu] First execution including JIT compilation took 0.3657665252685547s. 
+[triton-dejavu] First execution including JIT compilation took 0.8384854793548584s. +[triton-dejavu] First execution including JIT compilation took 0.5150895118713379s. +[triton-dejavu] First execution including JIT compilation took 0.3905613422393799s. +[triton-dejavu] First execution including JIT compilation took 0.9925787448883057s. +[triton-dejavu] First execution including JIT compilation took 0.5739874839782715s. +[triton-dejavu] First execution including JIT compilation took 0.3871347904205322s. +[triton-dejavu] First execution including JIT compilation took 1.0117156505584717s. +[triton-dejavu] First execution including JIT compilation took 0.5855865478515625s. +[triton-dejavu] First execution including JIT compilation took 0.3940417766571045s. +[triton-dejavu] First execution including JIT compilation took 1.0509228706359863s. +[triton-dejavu] First execution including JIT compilation took 0.6001262664794922s. +[triton-dejavu] First execution including JIT compilation took 0.4003324508666992s. +[triton-dejavu] First execution including JIT compilation took 1.0769095420837402s. +[triton-dejavu] First execution including JIT compilation took 0.6213738918304443s. +[triton-dejavu] First execution including JIT compilation took 0.4390685558319092s. +[triton-dejavu] First execution including JIT compilation took 1.0989954471588135s. +[triton-dejavu] First execution including JIT compilation took 0.538212776184082s. +[triton-dejavu] First execution including JIT compilation took 0.34635400772094727s. +[triton-dejavu] First execution including JIT compilation took 1.1051290035247803s. +[triton-dejavu] First execution including JIT compilation took 0.6306774616241455s. +[triton-dejavu] First execution including JIT compilation took 0.3778243064880371s. +[triton-dejavu] First execution including JIT compilation took 1.2004315853118896s. +[triton-dejavu] First execution including JIT compilation took 0.8137209415435791s. +[triton-dejavu] First execution including JIT compilation took 0.38579225540161133s. +[triton-dejavu] First execution including JIT compilation took 1.2420098781585693s. +[triton-dejavu] First execution including JIT compilation took 0.6466991901397705s. +[triton-dejavu] First execution including JIT compilation took 0.36069154739379883s. +[triton-dejavu] First execution including JIT compilation took 1.1935985088348389s. +[triton-dejavu] First execution including JIT compilation took 0.7577130794525146s. +[triton-dejavu] First execution including JIT compilation took 0.4502859115600586s. +[triton-dejavu] First execution including JIT compilation took 1.3447489738464355s. +[triton-dejavu] First execution including JIT compilation took 0.9920356273651123s. +[triton-dejavu] First execution including JIT compilation took 0.431868314743042s. +[triton-dejavu] First execution including JIT compilation took 1.4089694023132324s. +[triton-dejavu] First execution including JIT compilation took 0.6866104602813721s. +[triton-dejavu] First execution including JIT compilation took 0.445110559463501s. +[triton-dejavu] First execution including JIT compilation took 1.3942172527313232s. +[triton-dejavu] First execution including JIT compilation took 0.5355641841888428s. +[triton-dejavu] First execution including JIT compilation took 0.32257676124572754s. +[triton-dejavu] First execution including JIT compilation took 1.2880756855010986s. +[triton-dejavu] First execution including JIT compilation took 0.6331043243408203s. 
+[triton-dejavu] First execution including JIT compilation took 0.3692941665649414s.
+[triton-dejavu] First execution including JIT compilation took 1.7392678260803223s.
+[triton-dejavu] First execution including JIT compilation took 0.649709939956665s.
+[triton-dejavu] First execution including JIT compilation took 0.38323354721069336s.
+[triton-dejavu] First execution including JIT compilation took 1.8752937316894531s.
+[triton-dejavu] First execution including JIT compilation took 0.724346399307251s.
+[triton-dejavu] First execution including JIT compilation took 0.37693119049072266s.
+[triton-dejavu] First execution including JIT compilation took 2.0627846717834473s.
+[triton-dejavu] First execution including JIT compilation took 0.7618973255157471s.
+[triton-dejavu] First execution including JIT compilation took 0.4468967914581299s.
+[triton-dejavu] First execution including JIT compilation took 2.021761178970337s.
+bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 2.558701276779175s.
+[triton-dejavu] First execution including JIT compilation took 0.8006973266601562s.
+[triton-dejavu] First execution including JIT compilation took 0.4271361827850342s.
+[triton-dejavu] First execution including JIT compilation took 2.6522200107574463s.
+[triton-dejavu] First execution including JIT compilation took 0.860508918762207s.
+[triton-dejavu] First execution including JIT compilation took 0.4831836223602295s.
+[triton-dejavu] First execution including JIT compilation took 7.42484712600708s.
+bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 6.010982990264893s. +[triton-dejavu] First execution including JIT compilation took 1.7582054138183594s. +[triton-dejavu] First execution including JIT compilation took 1.0528242588043213s. +[triton-dejavu] First execution including JIT compilation took 6.84581995010376s. +bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1146880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1146880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+[triton-dejavu] First execution including JIT compilation took 1.7142207622528076s.
+[triton-dejavu] First execution including JIT compilation took 0.8741211891174316s.
+[triton-dejavu] First execution including JIT compilation took 0.44619178771972656s.
+[triton-dejavu] First execution including JIT compilation took 1.8718609809875488s.
+[triton-dejavu] First execution including JIT compilation took 0.9042544364929199s.
+[triton-dejavu] First execution including JIT compilation took 0.4581465721130371s.
+[triton-dejavu] First execution including JIT compilation took 2.1042685508728027s.
+[triton-dejavu] First execution including JIT compilation took 0.908367395401001s.
+[triton-dejavu] First execution including JIT compilation took 0.48277711868286133s.
+[triton-dejavu] First execution including JIT compilation took 1.7529594898223877s.
+[triton-dejavu] First execution including JIT compilation took 0.9210634231567383s.
+[triton-dejavu] First execution including JIT compilation took 0.5785129070281982s.
+[triton-dejavu] First execution including JIT compilation took 1.9719526767730713s.
+[triton-dejavu] First execution including JIT compilation took 0.926983118057251s.
+[triton-dejavu] First execution including JIT compilation took 0.47329115867614746s.
+[triton-dejavu] First execution including JIT compilation took 1.8675498962402344s.
+[triton-dejavu] First execution including JIT compilation took 0.8849301338195801s.
+[triton-dejavu] First execution including JIT compilation took 0.4898045063018799s.
+[triton-dejavu] First execution including JIT compilation took 1.819542407989502s.
+[triton-dejavu] First execution including JIT compilation took 0.981731653213501s.
+[triton-dejavu] First execution including JIT compilation took 0.5096790790557861s.
+[triton-dejavu] First execution including JIT compilation took 2.11425518989563s.
+[triton-dejavu] First execution including JIT compilation took 0.837721586227417s.
+[triton-dejavu] First execution including JIT compilation took 0.4882984161376953s.
+[triton-dejavu] First execution including JIT compilation took 2.053067922592163s.
+[triton-dejavu] First execution including JIT compilation took 0.897794246673584s.
+[triton-dejavu] First execution including JIT compilation took 0.4767446517944336s.
+[triton-dejavu] First execution including JIT compilation took 2.07883358001709s.
+[triton-dejavu] First execution including JIT compilation took 1.0238347053527832s.
+[triton-dejavu] First execution including JIT compilation took 0.6266560554504395s.
+[triton-dejavu] First execution including JIT compilation took 2.814924478530884s.
+[triton-dejavu] First execution including JIT compilation took 1.255967378616333s.
+[triton-dejavu] First execution including JIT compilation took 0.680903434753418s.
+[triton-dejavu] First execution including JIT compilation took 2.395393133163452s.
+[triton-dejavu] First execution including JIT compilation took 1.0010457038879395s.
+[triton-dejavu] First execution including JIT compilation took 0.6347818374633789s.
+[triton-dejavu] First execution including JIT compilation took 2.7960519790649414s.
+[triton-dejavu] First execution including JIT compilation took 1.0326149463653564s.
+[triton-dejavu] First execution including JIT compilation took 0.5450241565704346s.
+[triton-dejavu] First execution including JIT compilation took 2.445779800415039s.
+[triton-dejavu] First execution including JIT compilation took 1.0319764614105225s.
+[triton-dejavu] First execution including JIT compilation took 0.6632704734802246s.
+[triton-dejavu] First execution including JIT compilation took 2.80086088180542s.
+[triton-dejavu] First execution including JIT compilation took 1.1742348670959473s.
+[triton-dejavu] First execution including JIT compilation took 0.5098991394042969s.
+[triton-dejavu] First execution including JIT compilation took 2.790087938308716s.
+[triton-dejavu] First execution including JIT compilation took 1.1971583366394043s.
+[triton-dejavu] First execution including JIT compilation took 0.5753312110900879s.
+[triton-dejavu] First execution including JIT compilation took 3.8199825286865234s.
+[triton-dejavu] First execution including JIT compilation took 1.5596168041229248s.
+[triton-dejavu] First execution including JIT compilation took 0.7234528064727783s.
+[triton-dejavu] First execution including JIT compilation took 3.8001348972320557s.
+[triton-dejavu] First execution including JIT compilation took 1.3300747871398926s.
+[triton-dejavu] First execution including JIT compilation took 0.8064060211181641s.
+[triton-dejavu] First execution including JIT compilation took 3.833221673965454s.
+bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 4.838615655899048s.
+[triton-dejavu] First execution including JIT compilation took 1.5911104679107666s.
+[triton-dejavu] First execution including JIT compilation took 0.7249307632446289s.
+[triton-dejavu] First execution including JIT compilation took 5.080144166946411s.
+[triton-dejavu] First execution including JIT compilation took 1.7896246910095215s.
+[triton-dejavu] First execution including JIT compilation took 0.7319927215576172s.
+[triton-dejavu] First execution including JIT compilation took 10.777840614318848s.
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 688128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 688128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+ +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ + self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) +RuntimeError: Triton Error [CUDA]: out of memory + +[triton-dejavu] First execution including JIT compilation took 5.033360242843628s. +[triton-dejavu] First execution including JIT compilation took 1.410045862197876s. +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ + self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1179648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1179648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1179648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1179648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1376256, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1376256, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1572864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1572864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1572864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1572864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 4.680861473083496s. +[triton-dejavu] First execution including JIT compilation took 2.364461898803711s. +[triton-dejavu] First execution including JIT compilation took 0.8534502983093262s. +[triton-dejavu] First execution including JIT compilation took 4.708017349243164s. +[triton-dejavu] First execution including JIT compilation took 2.841503858566284s. +[triton-dejavu] First execution including JIT compilation took 1.0484719276428223s. +[triton-dejavu] First execution including JIT compilation took 4.7807886600494385s. +[triton-dejavu] First execution including JIT compilation took 2.8980062007904053s. +[triton-dejavu] First execution including JIT compilation took 1.0707988739013672s. +[triton-dejavu] First execution including JIT compilation took 4.607600212097168s. +[triton-dejavu] First execution including JIT compilation took 2.8636832237243652s. +[triton-dejavu] First execution including JIT compilation took 1.1431879997253418s. +[triton-dejavu] First execution including JIT compilation took 4.923970699310303s. +[triton-dejavu] First execution including JIT compilation took 2.79614520072937s. +[triton-dejavu] First execution including JIT compilation took 1.0749492645263672s. +[triton-dejavu] First execution including JIT compilation took 4.696893692016602s. +[triton-dejavu] First execution including JIT compilation took 2.8622703552246094s. +[triton-dejavu] First execution including JIT compilation took 1.0982391834259033s. +[triton-dejavu] First execution including JIT compilation took 4.7404444217681885s. +[triton-dejavu] First execution including JIT compilation took 2.878173828125s. +[triton-dejavu] First execution including JIT compilation took 1.1065995693206787s. +[triton-dejavu] First execution including JIT compilation took 4.991016626358032s. +[triton-dejavu] First execution including JIT compilation took 2.5021591186523438s. +[triton-dejavu] First execution including JIT compilation took 0.9695248603820801s. +[triton-dejavu] First execution including JIT compilation took 5.3018670082092285s. 
+[triton-dejavu] First execution including JIT compilation took 3.273489236831665s. +[triton-dejavu] First execution including JIT compilation took 1.181260108947754s. +[triton-dejavu] First execution including JIT compilation took 5.431257247924805s. +[triton-dejavu] First execution including JIT compilation took 3.352473497390747s. +[triton-dejavu] First execution including JIT compilation took 1.186856985092163s. +[triton-dejavu] First execution including JIT compilation took 5.393920183181763s. +[triton-dejavu] First execution including JIT compilation took 3.40191650390625s. +[triton-dejavu] First execution including JIT compilation took 1.1941492557525635s. +[triton-dejavu] First execution including JIT compilation took 5.543420314788818s. +[triton-dejavu] First execution including JIT compilation took 3.3016717433929443s. +[triton-dejavu] First execution including JIT compilation took 1.2081632614135742s. +[triton-dejavu] First execution including JIT compilation took 5.640880107879639s. +[triton-dejavu] First execution including JIT compilation took 3.5443694591522217s. +[triton-dejavu] First execution including JIT compilation took 1.3958439826965332s. +[triton-dejavu] First execution including JIT compilation took 5.6015305519104s. +bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 6.935962200164795s. +[triton-dejavu] First execution including JIT compilation took 3.3080406188964844s. +[triton-dejavu] First execution including JIT compilation took 1.2709336280822754s. +[triton-dejavu] First execution including JIT compilation took 7.072402715682983s. +[triton-dejavu] First execution including JIT compilation took 3.7861485481262207s. +[triton-dejavu] First execution including JIT compilation took 1.4361011981964111s. +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ + self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) +RuntimeError: Triton Error [CUDA]: out of memory + +[triton-dejavu] First execution including JIT compilation took 5.464360475540161s. +[triton-dejavu] First execution including JIT compilation took 1.6160335540771484s. 
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 458752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+[triton-dejavu] First execution including JIT compilation took 5.580164670944214s.
+[triton-dejavu] First execution including JIT compilation took 2.2763874530792236s.
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 917504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1572864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1572864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1572864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1572864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1835008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1835008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 2097152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2097152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 2097152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2097152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
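Editorial note on the failures above: the autotuner is proposing tile shapes whose staged shared-memory tiles exceed the 232448-byte per-SM limit reported in the errors. The following is only a hypothetical pre-filter sketch in plain Triton; the size formula assumes a pipelined matmul-style kernel that buffers one BLOCK_M x BLOCK_K and one BLOCK_K x BLOCK_N bf16 tile per stage, which is an assumption and not taken from the kernels being tuned here.

# Hypothetical sketch: prune autotune configs whose estimated shared-memory
# footprint cannot fit the hardware limit seen in the log (232448 bytes).
import triton

SMEM_LIMIT = 232448  # hardware limit printed in the OutOfResources errors
ELEM_BYTES = 2       # bf16 inputs (assumption)

def smem_estimate(bm: int, bn: int, bk: int, num_stages: int) -> int:
    # rough estimate: one A tile and one B tile buffered per pipeline stage
    return num_stages * (bm * bk + bk * bn) * ELEM_BYTES

configs = [
    triton.Config(
        {"BLOCK_SIZE_M": bm, "BLOCK_SIZE_N": bn, "BLOCK_SIZE_K": bk},
        num_stages=ns,
        num_warps=4,
    )
    for bm in (16, 32, 64, 128)
    for bn in (32, 64, 128, 256)
    for bk in (32, 64, 128)
    for ns in (2, 3, 4)
    if smem_estimate(bm, bn, bk, ns) <= SMEM_LIMIT
]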
+
+[triton-dejavu] added BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 32, BLOCK_SIZE_K: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None for _bmm_chunk_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default and key ('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')
+[2025-07-23 17:21:31] Triton autotuning for function _bmm_chunk_fwd_kernel finished after 10756.57s; best config selected: BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 32, BLOCK_SIZE_K: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None with benchmark time 0.002230335958302021; evaluated 2625 configurations;
+[triton-dejavu] ('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16') not in cache, starting to tune...
+[triton-dejavu] [2025-07-23 17:21:31] Started benchmarking of 2625 configurations... (use_bo: False, run: 0)
+[triton-dejavu] First execution including JIT compilation took 0.30918288230895996s.
+[triton-dejavu] First execution including JIT compilation took 0.2933952808380127s.
+[triton-dejavu] First execution including JIT compilation took 0.25191783905029297s.
+bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 0.4952049255371094s.
+bench_cudagraph failed with CUDA error: out of memory
+CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1
+Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph
+    with torch.cuda.graph(g):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__
+    self.cuda_graph.capture_end()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end
+    super().capture_end()
+RuntimeError: CUDA error: out of memory
+CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1
+Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+
+[triton-dejavu] First execution including JIT compilation took 0.38374853134155273s.
+[triton-dejavu] First execution including JIT compilation took 0.34327268600463867s.
+[triton-dejavu] First execution including JIT compilation took 0.41890788078308105s.
+bench_cudagraph failed with CUDA error: out of memory
+CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1
+Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+ +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.38816308975219727s. +[triton-dejavu] First execution including JIT compilation took 0.6418576240539551s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.48399782180786133s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.41680216789245605s. +[triton-dejavu] First execution including JIT compilation took 0.7157330513000488s. 
+bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5226426124572754s. +[triton-dejavu] First execution including JIT compilation took 0.44080543518066406s. +[triton-dejavu] First execution including JIT compilation took 0.7858779430389404s. +[triton-dejavu] First execution including JIT compilation took 0.5671470165252686s. +[triton-dejavu] First execution including JIT compilation took 0.45592260360717773s. +[triton-dejavu] First execution including JIT compilation took 0.8100578784942627s. +[triton-dejavu] First execution including JIT compilation took 0.6213173866271973s. +[triton-dejavu] First execution including JIT compilation took 0.47237181663513184s. +[triton-dejavu] First execution including JIT compilation took 0.7891368865966797s. +[triton-dejavu] First execution including JIT compilation took 0.6662912368774414s. +[triton-dejavu] First execution including JIT compilation took 0.4879744052886963s. +[triton-dejavu] First execution including JIT compilation took 0.731757640838623s. +[triton-dejavu] First execution including JIT compilation took 0.4918680191040039s. +[triton-dejavu] First execution including JIT compilation took 0.37989187240600586s. +[triton-dejavu] First execution including JIT compilation took 0.6664383411407471s. +[triton-dejavu] First execution including JIT compilation took 0.4817678928375244s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.3559696674346924s. +[triton-dejavu] First execution including JIT compilation took 0.8642761707305908s. +[triton-dejavu] First execution including JIT compilation took 0.5662164688110352s. +[triton-dejavu] First execution including JIT compilation took 0.45751142501831055s. +[triton-dejavu] First execution including JIT compilation took 0.9735383987426758s. +[triton-dejavu] First execution including JIT compilation took 0.6600606441497803s. +[triton-dejavu] First execution including JIT compilation took 0.48941469192504883s. +[triton-dejavu] First execution including JIT compilation took 1.0599989891052246s. +[triton-dejavu] First execution including JIT compilation took 0.5858447551727295s. +[triton-dejavu] First execution including JIT compilation took 0.40030384063720703s. +[triton-dejavu] First execution including JIT compilation took 0.9032082557678223s. +[triton-dejavu] First execution including JIT compilation took 0.5963606834411621s. +[triton-dejavu] First execution including JIT compilation took 0.5698938369750977s. +[triton-dejavu] First execution including JIT compilation took 0.9204597473144531s. +[triton-dejavu] First execution including JIT compilation took 0.7513656616210938s. +[triton-dejavu] First execution including JIT compilation took 0.5392777919769287s. +[triton-dejavu] First execution including JIT compilation took 1.3184521198272705s. +[triton-dejavu] First execution including JIT compilation took 0.7888948917388916s. +[triton-dejavu] First execution including JIT compilation took 0.6177425384521484s. +[triton-dejavu] First execution including JIT compilation took 1.1905827522277832s. +[triton-dejavu] First execution including JIT compilation took 0.7364373207092285s. +[triton-dejavu] First execution including JIT compilation took 0.5242094993591309s. +[triton-dejavu] First execution including JIT compilation took 1.2864527702331543s. +[triton-dejavu] First execution including JIT compilation took 0.8166484832763672s. +[triton-dejavu] First execution including JIT compilation took 0.5594861507415771s. +[triton-dejavu] First execution including JIT compilation took 1.8655834197998047s. +[triton-dejavu] First execution including JIT compilation took 0.9145352840423584s. +[triton-dejavu] First execution including JIT compilation took 0.6113896369934082s. +[triton-dejavu] First execution including JIT compilation took 1.9301745891571045s. +[triton-dejavu] First execution including JIT compilation took 1.0628697872161865s. +[triton-dejavu] First execution including JIT compilation took 0.64133620262146s. +[triton-dejavu] First execution including JIT compilation took 2.2749366760253906s. +[triton-dejavu] First execution including JIT compilation took 1.0524189472198486s. +[triton-dejavu] First execution including JIT compilation took 0.677316427230835s. +bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.6699347496032715s. +[triton-dejavu] First execution including JIT compilation took 0.39471912384033203s. +[triton-dejavu] First execution including JIT compilation took 0.31299400329589844s. +[triton-dejavu] First execution including JIT compilation took 0.9980213642120361s. +[triton-dejavu] First execution including JIT compilation took 0.4564340114593506s. +[triton-dejavu] First execution including JIT compilation took 0.39405226707458496s. 
+[triton-dejavu] First execution including JIT compilation took 0.721914529800415s. +[triton-dejavu] First execution including JIT compilation took 0.5424695014953613s. +[triton-dejavu] First execution including JIT compilation took 0.41809797286987305s. +[triton-dejavu] First execution including JIT compilation took 0.7378096580505371s. +[triton-dejavu] First execution including JIT compilation took 0.538069486618042s. +[triton-dejavu] First execution including JIT compilation took 0.43320608139038086s. +[triton-dejavu] First execution including JIT compilation took 0.8680074214935303s. +[triton-dejavu] First execution including JIT compilation took 0.5815584659576416s. +[triton-dejavu] First execution including JIT compilation took 0.44110822677612305s. +[triton-dejavu] First execution including JIT compilation took 0.797199010848999s. +[triton-dejavu] First execution including JIT compilation took 0.7567603588104248s. +[triton-dejavu] First execution including JIT compilation took 0.47153782844543457s. +[triton-dejavu] First execution including JIT compilation took 0.8809914588928223s. +[triton-dejavu] First execution including JIT compilation took 0.6448085308074951s. +[triton-dejavu] First execution including JIT compilation took 0.5167965888977051s. +[triton-dejavu] First execution including JIT compilation took 0.745863676071167s. +[triton-dejavu] First execution including JIT compilation took 0.5225260257720947s. +[triton-dejavu] First execution including JIT compilation took 0.4189014434814453s. +[triton-dejavu] First execution including JIT compilation took 0.7760834693908691s. +[triton-dejavu] First execution including JIT compilation took 0.5539810657501221s. +[triton-dejavu] First execution including JIT compilation took 0.44478821754455566s. +[triton-dejavu] First execution including JIT compilation took 0.8012809753417969s. +[triton-dejavu] First execution including JIT compilation took 0.6483604907989502s. +[triton-dejavu] First execution including JIT compilation took 0.4678480625152588s. +[triton-dejavu] First execution including JIT compilation took 0.8454635143280029s. +[triton-dejavu] First execution including JIT compilation took 0.6168031692504883s. +[triton-dejavu] First execution including JIT compilation took 0.6612381935119629s. +[triton-dejavu] First execution including JIT compilation took 0.8955931663513184s. +[triton-dejavu] First execution including JIT compilation took 0.6233341693878174s. +[triton-dejavu] First execution including JIT compilation took 0.4918363094329834s. +[triton-dejavu] First execution including JIT compilation took 0.9699711799621582s. +[triton-dejavu] First execution including JIT compilation took 0.6717431545257568s. +[triton-dejavu] First execution including JIT compilation took 0.5321164131164551s. +[triton-dejavu] First execution including JIT compilation took 1.006484031677246s. +[triton-dejavu] First execution including JIT compilation took 0.7167990207672119s. +[triton-dejavu] First execution including JIT compilation took 0.5426886081695557s. +[triton-dejavu] First execution including JIT compilation took 0.8549697399139404s. +[triton-dejavu] First execution including JIT compilation took 0.5004158020019531s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.42880892753601074s. +[triton-dejavu] First execution including JIT compilation took 0.8870432376861572s. +[triton-dejavu] First execution including JIT compilation took 0.5380675792694092s. +[triton-dejavu] First execution including JIT compilation took 0.44513392448425293s. +[triton-dejavu] First execution including JIT compilation took 1.0106170177459717s. +[triton-dejavu] First execution including JIT compilation took 0.6438839435577393s. +[triton-dejavu] First execution including JIT compilation took 0.48810815811157227s. +[triton-dejavu] First execution including JIT compilation took 1.1047391891479492s. +[triton-dejavu] First execution including JIT compilation took 0.6829500198364258s. +[triton-dejavu] First execution including JIT compilation took 0.5343265533447266s. +[triton-dejavu] First execution including JIT compilation took 1.1722900867462158s. +[triton-dejavu] First execution including JIT compilation took 0.7511520385742188s. +[triton-dejavu] First execution including JIT compilation took 0.5391092300415039s. +[triton-dejavu] First execution including JIT compilation took 1.2446460723876953s. +[triton-dejavu] First execution including JIT compilation took 0.7718749046325684s. +[triton-dejavu] First execution including JIT compilation took 0.549095630645752s. +[triton-dejavu] First execution including JIT compilation took 1.3546397686004639s. +[triton-dejavu] First execution including JIT compilation took 0.7892618179321289s. +[triton-dejavu] First execution including JIT compilation took 0.46314549446105957s. +[triton-dejavu] First execution including JIT compilation took 0.9860119819641113s. +[triton-dejavu] First execution including JIT compilation took 0.8724544048309326s. +[triton-dejavu] First execution including JIT compilation took 0.4373140335083008s. +[triton-dejavu] First execution including JIT compilation took 1.0243175029754639s. +[triton-dejavu] First execution including JIT compilation took 0.6186015605926514s. +[triton-dejavu] First execution including JIT compilation took 0.4280831813812256s. +[triton-dejavu] First execution including JIT compilation took 1.5726463794708252s. +[triton-dejavu] First execution including JIT compilation took 0.9008209705352783s. +[triton-dejavu] First execution including JIT compilation took 0.44704723358154297s. +[triton-dejavu] First execution including JIT compilation took 1.6724953651428223s. +[triton-dejavu] First execution including JIT compilation took 0.8446671962738037s. +[triton-dejavu] First execution including JIT compilation took 0.4729273319244385s. +[triton-dejavu] First execution including JIT compilation took 1.7400548458099365s. 
+[triton-dejavu] First execution including JIT compilation took 0.8373644351959229s. +[triton-dejavu] First execution including JIT compilation took 0.5513191223144531s. +[triton-dejavu] First execution including JIT compilation took 1.8062167167663574s. +[triton-dejavu] First execution including JIT compilation took 0.8943934440612793s. +[triton-dejavu] First execution including JIT compilation took 0.5011510848999023s. +bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+ +bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 1.683664321899414s. +[triton-dejavu] First execution including JIT compilation took 0.8943469524383545s. +[triton-dejavu] First execution including JIT compilation took 0.5135440826416016s. +[triton-dejavu] First execution including JIT compilation took 1.9431862831115723s. +[triton-dejavu] First execution including JIT compilation took 1.2609891891479492s. +[triton-dejavu] First execution including JIT compilation took 0.8276565074920654s. +[triton-dejavu] First execution including JIT compilation took 6.717592477798462s. +[triton-dejavu] First execution including JIT compilation took 1.4402704238891602s. +[triton-dejavu] First execution including JIT compilation took 0.6466906070709229s. +bench_cudagraph failed with out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+[... the identical OutOfResources traceback (Required: 595968, Hardware limit: 232448) repeats here ...]
+[... "[triton-dejavu] First execution including JIT compilation took ...s." timing lines for the configurations that did compile ...]
+bench_cudagraph failed with out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[... identical OutOfResources traceback repeated ...]
+[... further "[triton-dejavu] First execution including JIT compilation took ...s." timing lines ...]
+bench_cudagraph failed with out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[... identical OutOfResources traceback repeated ...]
+bench_cudagraph failed with out of resource: shared memory, Required: 377856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[... identical OutOfResources traceback repeated ...]
+bench_cudagraph failed with out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[... identical OutOfResources traceback repeated ...]
+[... further "[triton-dejavu] First execution including JIT compilation took ...s." timing lines ...]
+bench_cudagraph failed with out of resource: shared memory, Required: 307200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[... identical OutOfResources traceback repeated ...]
+bench_cudagraph failed with out of resource: shared memory, Required: 456704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[... identical OutOfResources traceback repeated ...]
+bench_cudagraph failed with out of resource: shared memory, Required: 606208, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[... identical OutOfResources traceback repeated ...]
+bench_cudagraph failed with out of resource: shared memory, Required: 755712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[... identical OutOfResources traceback repeated ...]
+bench_cudagraph failed with out of resource: shared memory, Required: 1054720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1054720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.4465477466583252s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.3604874610900879s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.3589968681335449s. +[triton-dejavu] First execution including JIT compilation took 0.4949653148651123s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4129481315612793s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.37282252311706543s. +[triton-dejavu] First execution including JIT compilation took 0.49967503547668457s. 
+bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4030342102050781s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.38671374320983887s. +[triton-dejavu] First execution including JIT compilation took 0.6607780456542969s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5266311168670654s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.593902587890625s. +[triton-dejavu] First execution including JIT compilation took 0.5250449180603027s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4251673221588135s. +[triton-dejavu] First execution including JIT compilation took 0.41414403915405273s. +[triton-dejavu] First execution including JIT compilation took 0.5248408317565918s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.46840643882751465s. +[triton-dejavu] First execution including JIT compilation took 0.46053147315979004s. +[triton-dejavu] First execution including JIT compilation took 0.5807092189788818s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4862945079803467s. +[triton-dejavu] First execution including JIT compilation took 0.4458580017089844s. +[triton-dejavu] First execution including JIT compilation took 0.729177713394165s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
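The hint repeated in every failure, "For debugging consider passing CUDA_LAUNCH_BLOCKING=1", makes kernel launches synchronous so the reported stack trace points at the call that actually failed rather than at the end of the capture. A minimal way to apply it from Python is shown below; the variable has to be set before CUDA is initialized, and exporting it in the shell before launching the tuning run works just as well.

    # Set synchronous launches before CUDA is initialized in this process.
    import os
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

    import torch  # imported (and CUDA initialized) only after the variable is set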
+ + +[triton-dejavu] First execution including JIT compilation took 0.40300631523132324s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.3684837818145752s. +[triton-dejavu] First execution including JIT compilation took 0.5131044387817383s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4550745487213135s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.39257073402404785s. +[triton-dejavu] First execution including JIT compilation took 0.5535814762115479s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4340968132019043s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4005141258239746s. +[triton-dejavu] First execution including JIT compilation took 0.737922191619873s. 
+bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5746960639953613s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.397491455078125s. +[triton-dejavu] First execution including JIT compilation took 0.5818440914154053s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.6529510021209717s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4184551239013672s. +[triton-dejavu] First execution including JIT compilation took 0.6207692623138428s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4892761707305908s. +[triton-dejavu] First execution including JIT compilation took 0.44497179985046387s. +[triton-dejavu] First execution including JIT compilation took 0.635669469833374s. +[triton-dejavu] First execution including JIT compilation took 0.5256602764129639s. +[triton-dejavu] First execution including JIT compilation took 0.48749327659606934s. +[triton-dejavu] First execution including JIT compilation took 0.5720036029815674s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.42984604835510254s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.38751769065856934s. +[triton-dejavu] First execution including JIT compilation took 0.6071627140045166s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4755427837371826s. 
+bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.3990035057067871s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.61566162109375s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
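Once one capture attempt runs out of memory, every following configuration in the sweep fails the same way, which suggests the device is already close to full before the benchmark starts. Below is a hedged sketch of a cleanup step between attempts; the helper is hypothetical (triton-dejavu may or may not do something equivalent), and it only helps if the memory is held by dead Python objects or the caching allocator rather than by live tensors.

    import gc
    import torch

    def cleanup_after_failed_capture():
        # Hypothetical helper, not taken from triton_dejavu: the caller drops its
        # reference to the failed torch.cuda.CUDAGraph first, then this releases
        # whatever the Python GC and the caching allocator are still holding, so
        # the next autotune config starts from a cleaner allocator state.
        gc.collect()
        torch.cuda.synchronize()
        torch.cuda.empty_cache()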
+ + +[triton-dejavu] First execution including JIT compilation took 0.4701671600341797s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4040553569793701s. +[triton-dejavu] First execution including JIT compilation took 0.6511368751525879s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5009520053863525s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4353518486022949s. +[triton-dejavu] First execution including JIT compilation took 0.6790196895599365s. +[triton-dejavu] First execution including JIT compilation took 0.5248758792877197s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4504244327545166s. +[triton-dejavu] First execution including JIT compilation took 0.7124292850494385s. +[triton-dejavu] First execution including JIT compilation took 0.8151717185974121s. +[triton-dejavu] First execution including JIT compilation took 0.4823343753814697s. +[triton-dejavu] First execution including JIT compilation took 0.772719144821167s. +[triton-dejavu] First execution including JIT compilation took 0.6169350147247314s. +[triton-dejavu] First execution including JIT compilation took 0.5196678638458252s. +[triton-dejavu] First execution including JIT compilation took 0.7186233997344971s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4982566833496094s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4104623794555664s. +[triton-dejavu] First execution including JIT compilation took 1.1141915321350098s. +[triton-dejavu] First execution including JIT compilation took 0.5356433391571045s. +[triton-dejavu] First execution including JIT compilation took 0.4406569004058838s. +[triton-dejavu] First execution including JIT compilation took 0.8371496200561523s. +[triton-dejavu] First execution including JIT compilation took 0.5642838478088379s. +[triton-dejavu] First execution including JIT compilation took 0.4726717472076416s. +[triton-dejavu] First execution including JIT compilation took 0.93656325340271s. +[triton-dejavu] First execution including JIT compilation took 0.6194779872894287s. +[triton-dejavu] First execution including JIT compilation took 0.4953165054321289s. +[triton-dejavu] First execution including JIT compilation took 0.9690747261047363s. +[triton-dejavu] First execution including JIT compilation took 0.6501588821411133s. +[triton-dejavu] First execution including JIT compilation took 0.5288493633270264s. +[triton-dejavu] First execution including JIT compilation took 1.2569010257720947s. +[triton-dejavu] First execution including JIT compilation took 0.6968162059783936s. +[triton-dejavu] First execution including JIT compilation took 0.5399911403656006s. +[triton-dejavu] First execution including JIT compilation took 1.2143075466156006s. +[triton-dejavu] First execution including JIT compilation took 0.733314037322998s. +[triton-dejavu] First execution including JIT compilation took 0.6001999378204346s. +[triton-dejavu] First execution including JIT compilation took 1.1585540771484375s. +[triton-dejavu] First execution including JIT compilation took 0.7061564922332764s. +[triton-dejavu] First execution including JIT compilation took 0.509422779083252s. +[triton-dejavu] First execution including JIT compilation took 1.1820228099822998s. +[triton-dejavu] First execution including JIT compilation took 0.7445404529571533s. +[triton-dejavu] First execution including JIT compilation took 0.4977116584777832s. +[triton-dejavu] First execution including JIT compilation took 1.091106653213501s. +[triton-dejavu] First execution including JIT compilation took 0.7330291271209717s. +[triton-dejavu] First execution including JIT compilation took 0.5066168308258057s. +[triton-dejavu] First execution including JIT compilation took 1.6019270420074463s. +[triton-dejavu] First execution including JIT compilation took 0.9132928848266602s. 
+[triton-dejavu] First execution including JIT compilation took 0.6490397453308105s. +[triton-dejavu] First execution including JIT compilation took 1.566523790359497s. +[triton-dejavu] First execution including JIT compilation took 0.9093658924102783s. +[triton-dejavu] First execution including JIT compilation took 0.5752038955688477s. +[triton-dejavu] First execution including JIT compilation took 1.3499906063079834s. +[triton-dejavu] First execution including JIT compilation took 0.7472167015075684s. +[triton-dejavu] First execution including JIT compilation took 0.5233032703399658s. +bench_cudagraph failed with out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.4235203266143799s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.367872953414917s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.3045461177825928s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4933462142944336s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5991702079772949s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.3020298480987549s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.43563389778137207s. 
+bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.3850095272064209s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.40093398094177246s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
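A few configurations above fail differently: Triton itself refuses to launch the compiled kernel because it needs more shared memory than the GPU provides ("Required: 317440, Hardware limit: 232448"). These are raised as triton.runtime.errors.OutOfResources from _init_handles(), and unlike the capture OOMs they can be treated as a per-config skip. The wrapper below is a hypothetical illustration of that, not part of triton-dejavu.

    # Hedged sketch: skip autotune configs whose compiled kernel exceeds the
    # device's shared-memory limit instead of letting the benchmark abort.
    from triton.runtime.errors import OutOfResources

    def bench_or_skip(kernel_call):
        try:
            return kernel_call()           # compiles and launches the Triton kernel
        except OutOfResources as err:      # raised from _init_handles(), as in the trace
            print(f"skipping config: {err}")
            return float("inf")            # tell the autotuner this config is unusable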
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4576835632324219s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.111537218093872s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.6412017345428467s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.49378371238708496s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.0859580039978027s. +[triton-dejavu] First execution including JIT compilation took 0.6884167194366455s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5569989681243896s. 
+bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.1242315769195557s. +[triton-dejavu] First execution including JIT compilation took 0.7313017845153809s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5240278244018555s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.2283635139465332s. +[triton-dejavu] First execution including JIT compilation took 0.7234997749328613s. +[triton-dejavu] First execution including JIT compilation took 0.5484554767608643s. +[triton-dejavu] First execution including JIT compilation took 1.4758551120758057s. +[triton-dejavu] First execution including JIT compilation took 0.6073637008666992s. +[triton-dejavu] First execution including JIT compilation took 0.513019323348999s. +[triton-dejavu] First execution including JIT compilation took 1.0995526313781738s. +[triton-dejavu] First execution including JIT compilation took 0.6934101581573486s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4188697338104248s. +[triton-dejavu] First execution including JIT compilation took 1.2958614826202393s. +[triton-dejavu] First execution including JIT compilation took 0.8936920166015625s. +[triton-dejavu] First execution including JIT compilation took 0.6156136989593506s. +[triton-dejavu] First execution including JIT compilation took 1.6940598487854004s. +[triton-dejavu] First execution including JIT compilation took 0.9992377758026123s. +[triton-dejavu] First execution including JIT compilation took 0.6807270050048828s. +[triton-dejavu] First execution including JIT compilation took 1.7915706634521484s. +[triton-dejavu] First execution including JIT compilation took 0.8958044052124023s. +[triton-dejavu] First execution including JIT compilation took 0.6192543506622314s. +[triton-dejavu] First execution including JIT compilation took 1.4820353984832764s. +[triton-dejavu] First execution including JIT compilation took 0.9619314670562744s. +[triton-dejavu] First execution including JIT compilation took 0.6009480953216553s. +bench_cudagraph failed with out of resource: shared memory, Required: 272384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 272384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 272384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 272384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 272384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 272384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 374784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 374784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 374784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 374784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 374784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 374784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.5543904304504395s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.41063737869262695s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.30236029624938965s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5460262298583984s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.50726318359375s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.42197704315185547s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.686154842376709s. 
+bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5402421951293945s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.44283580780029297s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ + +[triton-dejavu] First execution including JIT compilation took 0.7077608108520508s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5410175323486328s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.47356486320495605s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.6953809261322021s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.7652671337127686s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.45247840881347656s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.7314624786376953s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5340700149536133s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.47040677070617676s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.7279396057128906s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5927844047546387s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5247375965118408s. 
+bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.6629395484924316s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.49787163734436035s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ + +[triton-dejavu] First execution including JIT compilation took 0.39687013626098633s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.7211334705352783s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5556731224060059s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.6893763542175293s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.7504291534423828s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5790531635284424s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.47012805938720703s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.6832501888275146s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4917154312133789s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.35793185234069824s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.0028676986694335938s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4932436943054199s. 
+bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.3936727046966553s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.6478121280670166s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ + +[triton-dejavu] First execution including JIT compilation took 0.5344371795654297s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.3720393180847168s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.6856989860534668s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.49400806427001953s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.40635251998901367s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.6419022083282471s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.42707204818725586s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.37799835205078125s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.7089602947235107s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.6736738681793213s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.3756542205810547s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.805124044418335s. 
+bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5075352191925049s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.3723928928375244s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ + +[triton-dejavu] First execution including JIT compilation took 0.7105093002319336s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5702188014984131s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.374800443649292s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.8257863521575928s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.562946081161499s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.42319226264953613s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.7553699016571045s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5543286800384521s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4214789867401123s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.7838122844696045s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5473670959472656s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.45372581481933594s. 
+bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.7879917621612549s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4837973117828369s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ + +[triton-dejavu] First execution including JIT compilation took 0.39473915100097656s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.8744144439697266s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.6230897903442383s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5225625038146973s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.4133057594299316s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.0481688976287842s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph
+    with torch.cuda.graph(g):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__
+    self.cuda_graph.capture_end()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end
+    super().capture_end()
+RuntimeError: CUDA error: out of memory
+CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1
+Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+
[... the same "bench_cudagraph failed with CUDA error: out of memory" capture traceback repeats for each tuning configuration, with first executions including JIT compilation taking between roughly 0.35 s and 2.4 s ...]
+
+bench_cudagraph failed with out of resource: shared memory, Required: 244736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 244736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 489472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
[... the corresponding OutOfResources traceback appears three times for each required shared-memory size, interleaved with further CUDA-graph out-of-memory capture failures like the one above ...]
+ + +[triton-dejavu] First execution including JIT compilation took 0.8379316329956055s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5130932331085205s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.36475372314453125s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.8153905868530273s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5603029727935791s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.41185498237609863s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.8476624488830566s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5228164196014404s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.4401214122772217s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.8370048999786377s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5649154186248779s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.41320371627807617s. 
+bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.9297363758087158s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.7755241394042969s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ + +[triton-dejavu] First execution including JIT compilation took 0.5734150409698486s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.232421636581421s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.8419132232666016s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.6011636257171631s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.1239733695983887s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.6979858875274658s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.487072229385376s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.2030727863311768s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.7380573749542236s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5238943099975586s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.343810796737671s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.7683749198913574s. 
+bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5486259460449219s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.3422448635101318s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ + +[triton-dejavu] First execution including JIT compilation took 0.7824568748474121s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5656516551971436s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.4012060165405273s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.824357271194458s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.6031546592712402s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.4797933101654053s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.9887309074401855s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.6871368885040283s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.6989936828613281s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.9585423469543457s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.692669153213501s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.65909743309021s. 
+bench_cudagraph failed with CUDA error: out of memory
+CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1
+Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph
+    with torch.cuda.graph(g):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__
+    self.cuda_graph.capture_end()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end
+    super().capture_end()
+RuntimeError: CUDA error: out of memory
+CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1
+Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+
[... the same "bench_cudagraph failed with CUDA error: out of memory" capture-time traceback repeats verbatim for each subsequent autotuner configuration; the interleaved "[triton-dejavu] First execution including JIT compilation took ..." lines range from roughly 0.44 s to 6.29 s ...]
+
+bench_cudagraph failed with out of resource: shared memory, Required: 259072, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 259072, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
[... the same OutOfResources traceback repeats three times each for Required: 259072, 359424, 317440, 417792, 518144, and 718848 bytes, always against the 232448-byte hardware limit, interleaved with further out-of-memory capture failures ...]
+
+[triton-dejavu] First execution including JIT compilation took 1.6915192604064941s.
+bench_cudagraph failed with CUDA error: out of memory
+CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1
+Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+ +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.001119613647461s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.7162504196166992s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.8200452327728271s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.0510845184326172s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.7335896492004395s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.6177794933319092s. 
+bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.9454030990600586s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5991966724395752s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ + +[triton-dejavu] First execution including JIT compilation took 1.6675848960876465s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.9363722801208496s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.6759653091430664s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.862114667892456s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.0182960033416748s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.6502413749694824s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.9105088710784912s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.9901387691497803s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.6667122840881348s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.7807495594024658s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.8490705490112305s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5338022708892822s. 
+bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.4980332851409912s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.87496018409729s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ + +[triton-dejavu] First execution including JIT compilation took 0.5721733570098877s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.6702356338500977s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.9042339324951172s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.6040554046630859s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.504411220550537s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.7958929538726807s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5112464427947998s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.6463310718536377s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.9492459297180176s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5592634677886963s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 2.21022367477417s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.9613430500030518s. 
+bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.5633087158203125s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 2.2821779251098633s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ + +[triton-dejavu] First execution including JIT compilation took 1.097722053527832s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 0.6317684650421143s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 3.0794928073883057s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1
+Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+
+
+[triton-dejavu] First execution including JIT compilation took 1.2995553016662598s.
+bench_cudagraph failed with CUDA error: out of memory
+CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1
+Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph
+    with torch.cuda.graph(g):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__
+    self.cuda_graph.capture_end()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end
+    super().capture_end()
+RuntimeError: CUDA error: out of memory
+CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1
+Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+
+
+[triton-dejavu] First execution including JIT compilation took 0.8081183433532715s.
+bench_cudagraph failed with CUDA error: out of memory
+[triton-dejavu] First execution including JIT compilation took 3.323143243789673s.
+bench_cudagraph failed with CUDA error: out of memory
+[triton-dejavu] First execution including JIT compilation took 1.379629373550415s.
+bench_cudagraph failed with CUDA error: out of memory
+[triton-dejavu] First execution including JIT compilation took 0.7605845928192139s.
+bench_cudagraph failed with CUDA error: out of memory
+
+bench_cudagraph failed with out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 2.796154260635376s.
+[triton-dejavu] First execution including JIT compilation took 1.4310317039489746s.
+bench_cudagraph failed with CUDA error: out of memory
+[triton-dejavu] First execution including JIT compilation took 0.7960169315338135s.
+bench_cudagraph failed with CUDA error: out of memory
+[triton-dejavu] First execution including JIT compilation took 3.4028375148773193s.
+bench_cudagraph failed with CUDA error: out of memory
+[triton-dejavu] First execution including JIT compilation took 1.6688313484191895s.
+bench_cudagraph failed with CUDA error: out of memory
+[triton-dejavu] First execution including JIT compilation took 0.9481635093688965s.
+bench_cudagraph failed with CUDA error: out of memory
+[triton-dejavu] First execution including JIT compilation took 8.42493486404419s.
+[triton-dejavu] First execution including JIT compilation took 1.7116987705230713s.
+bench_cudagraph failed with CUDA error: out of memory
+[triton-dejavu] First execution including JIT compilation took 0.7699902057647705s.
+bench_cudagraph failed with CUDA error: out of memory
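The repeated `RuntimeError: CUDA error: out of memory` entries above are all raised from `_do_bench_cudagraph`, i.e. while triton-dejavu captures the candidate kernel inside a CUDA graph rather than while running it eagerly. Below is a minimal sketch of that capture-and-replay pattern; it illustrates the `torch.cuda.graph` API under stated assumptions and is not the triton-dejavu source (`run_kernel` and `n_repeat` are placeholder names). The `capture_end()` frames in the tracebacks correspond to leaving the `with torch.cuda.graph(g)` block, which is where the graph's memory-pool allocation fails once the device is out of memory.

import torch

def bench_with_cudagraph(run_kernel, n_repeat: int = 10) -> float:
    # Warm up on a side stream so lazy allocations happen before capture.
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        run_kernel()
    torch.cuda.current_stream().wait_stream(s)

    g = torch.cuda.CUDAGraph()
    # __exit__ of this context calls g.capture_end(); that is the point where
    # the OOM errors in the log above are raised.
    with torch.cuda.graph(g):
        for _ in range(n_repeat):
            run_kernel()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    g.replay()
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / n_repeat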
+
+bench_cudagraph failed with out of resource: shared memory, Required: 257024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 422912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 588800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
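Every `OutOfResources` entry above reports the same hardware limit of 232448 bytes (~227 KiB of shared memory per block on this GPU); only the requirement changes, because it scales with the tile sizes and pipelining depth of the autotuner config being benchmarked. The sketch below shows, for a generic autotuned matmul-style Triton kernel, what "Reducing block sizes or `num_stages`" means in practice; the kernel and the config values are illustrative assumptions, not the kernels being tuned in this log.

import triton
import triton.language as tl

@triton.autotune(
    configs=[
        # Small tiles, shallow pipeline: low shared-memory footprint, compiles everywhere.
        triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32},
                      num_stages=2, num_warps=4),
        # Large tiles with deeper pipelining: the kind of config that exceeds the
        # per-block shared-memory limit reported in the log above.
        triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128},
                      num_stages=5, num_warps=8),
    ],
    key=["M", "N", "K"],
)
@triton.jit
def matmul_kernel(a_ptr, b_ptr, c_ptr, M, N, K,
                  stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn,
                  BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr):
    # Assumes M, N, K are multiples of the block sizes (no masking) to keep the sketch short.
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for k in range(0, K, BLOCK_K):
        a = tl.load(a_ptr + offs_m[:, None] * stride_am + (k + offs_k)[None, :] * stride_ak)
        b = tl.load(b_ptr + (k + offs_k)[:, None] * stride_bk + offs_n[None, :] * stride_bn)
        acc += tl.dot(a, b)
    tl.store(c_ptr + offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn,
             acc.to(c_ptr.dtype.element_ty))

With fp16 inputs, the second config stages roughly num_stages * (BLOCK_M + BLOCK_N) * BLOCK_K * 2 bytes, about 320 KiB of tiles, in shared memory, so configs of that shape fail with OutOfResources while the first one compiles.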
+ +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.062030553817749s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 6.113872051239014s. +[triton-dejavu] First execution including JIT compilation took 2.0313453674316406s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +[triton-dejavu] First execution including JIT compilation took 1.0472145080566406s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + + +bench_cudagraph failed with out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 514048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 514048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 514048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 514048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 514048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 514048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 845824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 845824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 845824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 1177600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 0.48328304290771484s.
+bench_cudagraph failed with CUDA error: out of memory
+CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1
+Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph
+    with torch.cuda.graph(g):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__
+    self.cuda_graph.capture_end()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end
+    super().capture_end()
+RuntimeError: CUDA error: out of memory
+CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
+For debugging consider passing CUDA_LAUNCH_BLOCKING=1
+Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
+ + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +[triton-dejavu] First execution including JIT compilation took 0.5852396488189697s. +bench_cudagraph failed with CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. + +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph + with torch.cuda.graph(g): + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ + self.cuda_graph.capture_end() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end + super().capture_end() +RuntimeError: CUDA error: out of memory +CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. +For debugging consider passing CUDA_LAUNCH_BLOCKING=1 +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
+ + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +[triton-dejavu] First execution including JIT compilation took 0.8381209373474121s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5893561840057373s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
+
+bench_cudagraph failed with out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 285696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 285696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 285696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 285696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 2.9359774589538574s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8123137950897217s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.4981215000152588s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.2556896209716797s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.0426855087280273s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5906248092651367s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.9886162281036377s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.2194347381591797s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6244046688079834s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 411648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 411648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 563200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 563200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 571392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 571392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 571392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 571392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.48250651359558105s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.31485867500305176s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.3440537452697754s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8460357189178467s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.36809873580932617s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.3085494041442871s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5444772243499756s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.38303327560424805s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.34803223609924316s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5283372402191162s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.3868238925933838s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.35518574714660645s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5908901691436768s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.41735363006591797s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6766963005065918s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5999925136566162s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.41122961044311523s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.3416872024536133s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5752973556518555s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.3956427574157715s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.3643150329589844s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5298521518707275s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.3585391044616699s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.3086113929748535s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5575377941131592s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.3903212547302246s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.3265855312347412s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5826382637023926s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.43185901641845703s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.38982224464416504s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7225501537322998s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5456938743591309s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.49631500244140625s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.004322052001953125s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.4188666343688965s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5133178234100342s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6560304164886475s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.43018031120300293s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.4307105541229248s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7024564743041992s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5363326072692871s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.39928627014160156s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6894314289093018s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 234496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 234496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 234496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 234496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 310272, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 310272, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 318464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 318464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 318464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 318464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 2.6594467163085938s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.3652985095977783s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5937278270721436s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.922431230545044s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.2858715057373047s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5902688503265381s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.350640296936035s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.5020318031311035s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6842968463897705s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 385024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 385024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 385024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 385024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 452608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 452608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 468992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 468992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 468992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 468992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 620544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 620544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 636928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 636928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 636928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 636928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.7688605785369873s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.3665287494659424s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.3449244499206543s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6777553558349609s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.4027137756347656s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.38666725158691406s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7940988540649414s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.4141719341278076s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.37494373321533203s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7313904762268066s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.42134833335876465s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.3692958354949951s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7075221538543701s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.43701624870300293s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.42354393005371094s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8378398418426514s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.4932551383972168s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.4718587398529053s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8113245964050293s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.4463827610015869s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.428286075592041s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6905107498168945s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.4112887382507324s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.33467864990234375s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7086637020111084s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.45603132247924805s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7833783626556396s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.9827532768249512s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5891172885894775s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.49228405952453613s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.0715341567993164s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6479880809783936s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5540139675140381s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.0900757312774658s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8101351261138916s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.526641845703125s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.1183600425720215s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6459150314331055s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5361812114715576s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.203599214553833s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6816227436065674s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5723059177398682s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
+bench_cudagraph failed with out of resource: shared memory, Required: 233472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 233472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 267264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 283648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 367616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 367616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 384000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 384000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 384000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 384000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 3.207704782485962s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.2239928245544434s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6375505924224854s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.287391424179077s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.2227861881256104s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6965057849884033s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 233472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 233472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 266240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 266240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 266240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 266240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 434176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 434176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 466944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 466944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 466944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 466944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 534528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 534528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 567296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 567296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 567296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 567296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 735232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 735232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 768000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 768000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 768000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 768000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.9831523895263672s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5336413383483887s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.46745753288269043s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.0242087841033936s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6498258113861084s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6161227226257324s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.0387804508209229s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6910531520843506s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.6161599159240723s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7413544654846191s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7766103744506836s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.6750471591949463s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.003296375274658203s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6038086414337158s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.8977270126342773s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.9200453758239746s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7148220539093018s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.773270845413208s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8617911338806152s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6090264320373535s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 241152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
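Every candidate in the sweep above fails the same way: _do_bench_cudagraph cannot even allocate its ~256 MB int8 flush cache because the model under test already occupies nearly the whole 80 GB device, and the allocator message suggests PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True. Below is a minimal launcher sketch along those lines; the wrapper and the try_flush_cache helper are illustrative only (not part of triton-dejavu or vLLM), and the environment variable only takes effect if it is set before the first CUDA allocation.

    import os

    # Allocator option suggested by the error messages above; it must be set
    # before torch initializes CUDA, hence before importing torch.
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

    import torch

    def try_flush_cache(num_bytes: int = int(256e6)):
        # Mirrors the 256 MB int8 cache allocated in _do_bench_cudagraph;
        # returns None instead of raising so a sweep can skip this config.
        try:
            return torch.empty(num_bytes, dtype=torch.int8, device="cuda")
        except torch.OutOfMemoryError:
            return None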
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 241152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 257536, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 257536, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 257536, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 257536, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 2.089289903640747s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8955142498016357s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7933282852172852s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.6730310916900635s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.027360200881958s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6189002990722656s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.377192258834839s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.0640830993652344s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6758365631103516s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.402773380279541s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+bench_cudagraph failed with out of resource: shared memory, Required: 282624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 315392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 315392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 349184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 381952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 381952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 482304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 515072, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 515072, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 4.78285551071167s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 1.790651559829712s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 0.8154182434082031s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 5.278247594833374s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 1.8989050388336182s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 1.1762864589691162s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 364544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 364544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 432128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 565248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 630784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 630784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 698368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 763904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 763904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 964608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1030144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1030144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 2.577185869216919s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 1.2491505146026611s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 0.7602677345275879s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 2.791684865951538s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 1.2859153747558594s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 0.7977027893066406s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 2.6780614852905273s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 1.3233978748321533s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 0.8441951274871826s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 2.740365743637085s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 1.2907094955444336s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 1.177889347076416s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use.
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.745009422302246s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.3827052116394043s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.9290802478790283s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.8287532329559326s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.3876776695251465s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8190820217132568s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.0606014728546143s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.3506171703338623s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8591070175170898s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.745933771133423s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.3523740768432617s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8213198184967041s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.9839930534362793s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.496906042098999s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8184218406677246s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.0979418754577637s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.5288279056549072s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8494882583618164s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.252285957336426s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.004141569137573242s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8563632965087891s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.2991631031036377s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.5022201538085938s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8538022041320801s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.498495578765869s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.5448570251464844s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
+    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
+            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[triton-dejavu] First execution including JIT compilation took 0.8523283004760742s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 3.52825927734375s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 1.6031606197357178s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 1.0104546546936035s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 3.137936592102051s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 1.556880235671997s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 0.949892520904541s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 3.461966037750244s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 1.5957205295562744s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 0.9467792510986328s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 4.070852518081665s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 1.6498074531555176s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 0.9917776584625244s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 4.3239054679870605s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 226.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 1.7489638328552246s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 226.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 0.9379489421844482s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 226.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 4.467419147491455s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 226.94 MiB is free.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 239616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 239616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 239616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 256512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 289280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 289280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 355840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 388608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 388608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 4.755504131317139s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 226.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 2.098724842071533s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 226.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 1.0180997848510742s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 226.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 5.3917906284332275s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 226.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 2.3101797103881836s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 226.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 1.0687329769134521s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 226.94 MiB is free.
+[triton-dejavu] First execution including JIT compilation took 9.263354539871216s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 224.94 MiB is free.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 280576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 280576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 314368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 379904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 379904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 479232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 479232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 513024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 578560, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 578560, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 711680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 711680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 777216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 777216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 777216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 777216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 10.830562353134155s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 5.693915367126465s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.4908199310302734s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 10.97849154472351s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 430080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 430080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 561152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 561152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 561152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 561152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 628736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 628736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 759808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 759808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 759808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 759808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 958464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 958464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 958464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 958464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1026048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1026048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1157120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1157120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1157120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1157120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1423360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1423360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1554432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1554432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1554432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1554432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.6892292499542236s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.38911986351013184s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.30687904357910156s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6879351139068604s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.42769932746887207s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
+    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
+            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[triton-dejavu] First execution including JIT compilation took 0.329437255859375s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.4274454116821289s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.4301061630249023s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6415538787841797s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.42043113708496094s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.3885080814361572s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6621842384338379s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.4451918601989746s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.3652503490447998s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7669777870178223s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.4610159397125244s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.5304932594299316s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7010109424591064s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
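All of these failures originate in the benchmarking helper itself: before timing a configuration under a CUDA graph, _do_bench_cudagraph allocates a 256 MB int8 buffer (torch.empty(int(256e6), ...)) to flush the L2 cache, and with vLLM already holding nearly the entire 80 GiB device it is this small allocation that fails. The sketch below is a minimal, hedged illustration of a more defensive variant that follows the log's own hint about PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True; the helper name safe_l2_flush_buffer and the fallback sizes are assumptions for illustration, not code from triton-dejavu.

    import os

    # The allocator reads PYTORCH_CUDA_ALLOC_CONF when it initializes, so set it
    # before the first CUDA allocation (simplest: before importing torch).
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

    import torch


    def safe_l2_flush_buffer(nbytes: int = int(256e6)):
        """Try to allocate the L2-flush buffer; shrink or skip instead of aborting the sweep."""
        while nbytes >= int(16e6):
            try:
                return torch.empty(nbytes, dtype=torch.int8, device="cuda")
            except torch.OutOfMemoryError:
                torch.cuda.empty_cache()
                nbytes //= 2  # hypothetical fallback: halve the flush buffer and retry
        return None  # give up on flushing L2 for this configuration

Enabling expandable segments only addresses allocator fragmentation; the shared-memory failures that follow are a different resource limit and are unaffected by allocator settings.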
+bench_cudagraph failed with out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[... the same triton.runtime.errors.OutOfResources is raised for every configuration whose tiles exceed the 232448-byte shared-memory limit; the sweep reports required sizes of 244736, 248832, 263680, 265728, 315392, 319488, 386048 and 390144 bytes, interleaved with further torch.OutOfMemoryError failures from the 256 MB flush buffer ...]
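As the message suggests ("Reducing block sizes or `num_stages` may help"), configurations whose tiles cannot fit into shared memory are best pruned before they are benchmarked at all. The following sketch is only an illustration under stated assumptions: the 232448-byte limit is copied from this log, the BLOCK_M/BLOCK_N/BLOCK_K parameter names and the matmul-style footprint estimate are hypothetical, and this is not triton-dejavu's actual pruning logic.

    import triton

    SHARED_MEM_LIMIT = 232448  # bytes; hardware limit reported in this log


    def estimated_shared_mem(cfg: triton.Config, elem_size: int = 2) -> int:
        """Rough A/B tile footprint for a matmul-like kernel with num_stages pipelining."""
        m = cfg.kwargs.get("BLOCK_M", 1)
        n = cfg.kwargs.get("BLOCK_N", 1)
        k = cfg.kwargs.get("BLOCK_K", 1)
        return cfg.num_stages * (m * k + k * n) * elem_size


    def prune_configs(configs: list) -> list:
        """Drop configs whose estimated shared-memory usage exceeds the hardware limit."""
        kept = [c for c in configs if estimated_shared_mem(c) <= SHARED_MEM_LIMIT]
        return kept or configs[:1]  # keep at least one config as a fallback

A filter along these lines can be hooked into Triton's autotuner (for example via its prune_configs_by / early-config-prune mechanism) so the sweep never launches kernels that exceed the limit. The log resumes below with the largest such failing configuration (390144 bytes required).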
+bench_cudagraph failed with out of resource: shared memory, Required: 390144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 390144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 5.510880470275879s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.307586193084717s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8460187911987305s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 4.805698871612549s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.5006825923919678s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8813536167144775s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 356352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 356352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 356352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 356352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 489472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 489472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 630784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 630784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 638976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 638976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 638976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 638976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 772096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 772096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 780288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 780288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 780288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 780288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1054720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1054720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1062912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1062912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1062912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1062912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 0.8435537815093994s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.4684276580810547s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.3505737781524658s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8157010078430176s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
+    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
+            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[triton-dejavu] First execution including JIT compilation took 0.5323681831359863s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
[... the identical torch.OutOfMemoryError traceback (triton_dejavu/testing.py, line 351), the "[triton-dejavu] First execution including JIT compilation took ...s." line, and the "bench_cudagraph failed with CUDA out of memory" message repeat verbatim for each remaining tuning configuration; only the reported JIT compilation times differ ...]
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use.
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8929169178009033s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.46093177795410156s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.5692577362060547s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7735788822174072s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.47498464584350586s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.6411559581756592s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7738308906555176s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6725783348083496s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.7961504459381104s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.9248149394989014s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5011544227600098s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.7031898498535156s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.0630671977996826s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5364012718200684s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
+    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
+            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+bench_cudagraph failed with out of resource: shared memory, Required: 278016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 278016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 282112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 2.6309447288513184s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 257024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 257024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 406528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 556032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 564224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 564224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 6.886560678482056s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 2.371800422668457s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 1.1015229225158691s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 5.3806397914886475s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 2.614715576171875s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 1.013362169265747s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+bench_cudagraph failed with out of resource: shared memory, Required: 364544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 380928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 380928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 514048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 663552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 813056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 829440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 829440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1112064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1128448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1128448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 1.0904018878936768s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 0.5664336681365967s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 0.3864610195159912s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 1.4100468158721924s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 0.591252326965332s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 0.3967752456665039s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 1.1207458972930908s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 0.6113357543945312s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 0.4384629726409912s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 1.3609027862548828s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 0.6447341442108154s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated.
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.4421412944793701s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.172593355178833s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6391100883483887s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.43094921112060547s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.3330214023590088s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6399593353271484s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.4591398239135742s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.2691125869750977s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7364680767059326s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.45516157150268555s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.197962999343872s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6473965644836426s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.38663530349731445s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.4497623443603516s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6786634922027588s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.45200419425964355s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.3861651420593262s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7120561599731445s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.43462252616882324s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.4068715572357178s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.0032808780670166016s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.46666932106018066s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.662893533706665s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7151412963867188s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.47615790367126465s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.4645192623138428s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
+    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
+            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[... the identical "bench_cudagraph failed with CUDA out of memory" message and traceback repeat verbatim for each subsequent autotune configuration; the interleaved "[triton-dejavu] First execution including JIT compilation took ...s" lines range from 0.003 s to 2.2 s ...]
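The allocator hint in the message above points at fragmentation: the benchmark only needs a 246 MiB scratch buffer, but almost all of the 79 GiB card is already held by the serving process. A minimal sketch of acting on that hint (not part of this patch; the script shown is illustrative) is below. The variable has to be set before the first CUDA allocation, so it is configured before torch is imported.

    import os

    # Must be in place before the first CUDA allocation in this process.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    import torch

    # The same 256 MB int8 scratch buffer that _do_bench_cudagraph tries to
    # allocate in the failing line of the traceback above.
    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
    print(f"{torch.cuda.memory_reserved() / 2**20:.0f} MiB reserved")

Expandable segments only help when the failure is due to fragmented reserved memory; if the GPU is genuinely full (as the 69.67 GiB allocated by PyTorch suggests here), lowering gpu_memory_utilization for the serving process is the more likely fix.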
+bench_cudagraph failed with out of resource: shared memory, Required: 306688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 306688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[... the same OutOfResources failure and traceback occur twice more for configurations requiring 314880 bytes, after which the CUDA out-of-memory failures resume with JIT compilation times of 3.56 s and 1.15 s ...]
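These OutOfResources failures are hit whenever a candidate configuration needs more shared memory than the device exposes (232448 bytes here); the error's own suggestion is to shrink block sizes or num_stages. A minimal sketch of a pruned autotune space, under the assumption of a generic Triton kernel (the BLOCK_M/BLOCK_N names and the key are illustrative, not the parameters used by this backend's kernels):

    import triton
    import triton.language as tl

    # Smaller tiles and fewer pipeline stages reduce the per-config shared-memory
    # footprint, keeping candidates under the reported 232448-byte hardware limit.
    configs = [
        triton.Config({"BLOCK_M": bm, "BLOCK_N": bn}, num_stages=s, num_warps=4)
        for bm in (32, 64)
        for bn in (32, 64, 128)
        for s in (1, 2)
    ]

    @triton.autotune(configs=configs, key=["N"])
    @triton.jit
    def _kernel(x_ptr, y_ptr, N, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
        pass  # kernel body elided; only the tuning space matters for this sketch

During a tuning sweep such failures are expected and simply mark those candidates as unusable; they do not abort the run.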
+[... further CUDA out-of-memory failures with the same traceback follow; the reported free memory drops from 64.94 MiB to 62.94 MiB and the JIT compilation times range from 0.56 s to 4.7 s ...]
+
+bench_cudagraph failed with out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[... identical OutOfResources traceback ...]
+bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[... identical OutOfResources traceback; this failure occurs twice ...]
+bench_cudagraph failed with out of resource: shared memory, Required: 364544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[... identical OutOfResources traceback ...]
+bench_cudagraph failed with out of resource: shared memory, Required: 380928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[... identical OutOfResources traceback; this failure occurs twice ...]
+bench_cudagraph failed with out of resource: shared memory, Required: 447488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 447488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 463872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 463872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 463872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 463872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 613376, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 613376, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 629760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 629760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 629760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 629760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 8.24333930015564s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.6616246700286865s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.9932739734649658s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 6.838382720947266s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 430080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 430080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 430080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 430080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 563200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 563200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 729088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 729088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 761856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 761856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 761856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 761856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 894976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 894976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 927744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 927744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 927744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 927744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1226752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1226752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1259520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1259520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1259520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1259520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 2.333728551864624s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.9741692543029785s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
+    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
+            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[triton-dejavu] First execution including JIT compilation took 0.5558607578277588s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
[... the same torch.OutOfMemoryError from triton_dejavu/testing.py, line 351 recurs for every subsequent autotuning configuration; only the reported JIT compilation time (roughly 0.14 s to 3.3 s) and the free GPU memory (60.94 MiB falling to 56.94 MiB) vary between repetitions ...]
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.639662504196167s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.8239479064941406s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.258443832397461s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6690225601196289s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 4.081692457199097s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.307067632675171s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6953957080841064s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 4.123711824417114s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.7064650058746338s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8023972511291504s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 4.322245359420776s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.376969575881958s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7755577564239502s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 264704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 281088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 281088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 281088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 281088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 364032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 364032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 380416, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 380416, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 380416, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 380416, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 6.028242111206055s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.8080382347106934s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8130929470062256s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 6.314016580581665s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.8047997951507568s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8820269107818604s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 7.452376842498779s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
+    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
+            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[... identical OutOfResources / torch.OutOfMemoryError tracebacks repeated after each of the following failure lines ...]
+bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 363520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 363520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 430080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 462848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 462848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 529408, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 562176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 562176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 728064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 760832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 760832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 11.448482990264893s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. [...]
+[triton-dejavu] First execution including JIT compilation took 4.280648231506348s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. [...]
+[triton-dejavu] First execution including JIT compilation took 1.4616827964782715s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. [...]
+bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 462848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 528384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 528384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 727040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 727040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 860160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 925696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 925696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1058816, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1124352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1124352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1456128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1521664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1521664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[triton-dejavu] First execution including JIT compilation took 5.428146839141846s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. [...]
+[triton-dejavu] First execution including JIT compilation took 2.640364408493042s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. [...]
+[triton-dejavu] First execution including JIT compilation took 1.1616089344024658s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. [...]
+[triton-dejavu] First execution including JIT compilation took 5.564483642578125s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. [...]
+[triton-dejavu] First execution including JIT compilation took 2.6187920570373535s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. [...]
+[triton-dejavu] First execution including JIT compilation took 1.2367215156555176s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. [...]
+[triton-dejavu] First execution including JIT compilation took 5.862403154373169s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. [...]
+[triton-dejavu] First execution including JIT compilation took 2.5825343132019043s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. [...]
+[triton-dejavu] First execution including JIT compilation took 1.245880126953125s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated.
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 5.880247354507446s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.4725282192230225s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.2873585224151611s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 5.849554061889648s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.6708860397338867s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.281620740890503s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 6.745583772659302s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.5308899879455566s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.291445016860962s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 6.3013856410980225s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.6642332077026367s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.5786309242248535s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 6.098564147949219s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 46.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 46.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.9687094688415527s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 46.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 46.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.5861866474151611s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 46.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 46.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 6.518315076828003s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 46.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 46.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.761479139328003s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 46.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 46.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.5685019493103027s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 46.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 46.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 6.746150255203247s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 46.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 46.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.955209970474243s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 46.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 46.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.9200007915496826s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 46.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 46.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 6.877080917358398s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 44.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 44.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.667490243911743s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 44.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 44.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.3610637187957764s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 44.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 44.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 7.141212463378906s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 44.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 44.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 4.017885208129883s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 44.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
+    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
+            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 44.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+bench_cudagraph failed with out of resource: shared memory, Required: 239360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 239360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
[The remaining autotuner configurations in this log fail in the same two ways: torch.OutOfMemoryError while allocating the 246.00 MiB benchmark cache in _do_bench_cudagraph (with 38.94-44.94 MiB free on GPU 0), and triton.runtime.errors.OutOfResources with required shared memory between 239360 and 693248 bytes against the 232448-byte hardware limit; reported first executions including JIT compilation range from roughly 1.4 s to 12.5 s.]
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 693248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 758784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 758784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 758784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 758784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 957440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 957440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1022976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1022976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1022976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1022976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+ +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ + self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) +RuntimeError: Triton Error [CUDA]: out of memory + +[triton-dejavu] First execution including JIT compilation took 11.809521436691284s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 4.696657657623291s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 593920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 593920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 724992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 724992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 724992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 724992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 858112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 858112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 989184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 989184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 989184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 989184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1122304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1122304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1253376, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1253376, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1253376, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1253376, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1386496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1386496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1517568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1517568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1517568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1517568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1914880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1914880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 2045952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2045952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 2045952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2045952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 1.3908696174621582s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6094310283660889s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
+    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
+            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[triton-dejavu] First execution including JIT compilation took 0.3785121440887451s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+bench_cudagraph failed with out of resource: shared memory, Required: 254720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 254720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 255744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 255744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 255744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 255744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 2.9389686584472656s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.9776608943939209s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.721153974533081s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.328566074371338s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.1687307357788086s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7047884464263916s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 4.064958572387695s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.2405052185058594s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7056465148925781s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 237056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 237056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 239104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 239104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 239104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 239104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 305152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 305152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 307200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 307200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 307200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 307200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 373248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 373248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 375296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 375296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 375296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 375296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 509440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 509440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 511488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 511488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 511488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 511488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 5.935272693634033s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.545255661010742s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8536858558654785s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 7.657502174377441s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.3809196949005127s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.9746437072753906s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 337920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 337920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 342016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 342016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 342016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 342016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 474112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 474112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 478208, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 478208, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 478208, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 478208, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 610304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 610304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 614400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 614400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 614400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 614400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 746496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 746496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 750592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 750592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 750592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 750592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1018880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1018880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1022976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1022976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1022976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1022976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 22.241742372512817s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 6.0289146900177s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.4155397415161133s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 403456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 403456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 411648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 411648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 411648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 411648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 675840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 675840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 684032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 684032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 684032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 684032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 948224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 948224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 956416, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 956416, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 956416, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 956416, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1220608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1220608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1228800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1228800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1228800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1228800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1492992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1492992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1501184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1501184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1501184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1501184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 2037760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2037760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 2045952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2045952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 2045952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2045952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 1.6654775142669678s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6987285614013672s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.46905040740966797s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.6758484840393066s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8101885318756104s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
+    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
+            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 30.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+bench_cudagraph failed with out of resource: shared memory, Required: 261888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 261888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 263936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 3.2446868419647217s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated.
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.817992925643921s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.297480583190918s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8131206035614014s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 4.6364405155181885s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.439967393875122s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7062406539916992s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 243200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 243200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 313344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 313344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 383488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 383488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 387584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 387584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 387584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 387584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 523776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 523776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 527872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 527872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 6.749169826507568s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.6596052646636963s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.9074513912200928s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 8.860004186630249s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.4395840167999268s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.0885822772979736s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 346112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 346112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 486400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 486400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 626688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 634880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 634880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 766976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 775168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 775168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1047552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1055744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1055744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 24.700168132781982s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
+    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
+            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[triton-dejavu] First execution including JIT compilation took 6.437432765960693s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[triton-dejavu] First execution including JIT compilation took 2.4199228286743164s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+bench_cudagraph failed with out of resource: shared memory, Required: 411648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 692224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 708608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 708608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 972800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 989184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 989184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1253376, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1269760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1269760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1533952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1550336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1550336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 2095104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 2111488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 2111488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 3.121896266937256s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[triton-dejavu] First execution including JIT compilation took 1.0428099632263184s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[triton-dejavu] First execution including JIT compilation took 0.6234233379364014s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[triton-dejavu] First execution including JIT compilation took 2.9806389808654785s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[triton-dejavu] First execution including JIT compilation took 1.2036631107330322s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[triton-dejavu] First execution including JIT compilation took 0.6355829238891602s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[triton-dejavu] First execution including JIT compilation took 3.0641424655914307s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[triton-dejavu] First execution including JIT compilation took 1.1515545845031738s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[triton-dejavu] First execution including JIT compilation took 0.646599292755127s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[triton-dejavu] First execution including JIT compilation took 3.120051383972168s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[triton-dejavu] First execution including JIT compilation took 1.219144582748413s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[triton-dejavu] First execution including JIT compilation took 0.6336996555328369s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use.
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.227719306945801s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.2983787059783936s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.648597002029419s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.2070751190185547s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.1535370349884033s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.7411158084869385s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.3133440017700195s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.2201085090637207s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6529905796051025s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.082692861557007s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.2449371814727783s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.5935671329498291s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.3032917976379395s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.5225858688354492s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6467459201812744s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.437588930130005s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.2381985187530518s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6682257652282715s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.7129526138305664s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.2800922393798828s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.6987648010253906s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.6680898666381836s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.3012008666992188s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.779672384262085s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.7857558727264404s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.3570668697357178s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8044769763946533s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 276224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 276224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
[... two further configurations fail identically with a shared-memory requirement of 280320 bytes against the 232448-byte hardware limit, after which the CUDA out-of-memory failures resume ...]
+
+[triton-dejavu] First execution including JIT compilation took 5.477536916732788s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use.
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.7072887420654297s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.8421585559844971s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 6.106162071228027s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.7843899726867676s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.9055137634277344s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 255488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 255488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 337920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 337920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 337920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 337920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 403968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 403968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 412160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 412160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 412160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 412160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 552448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 552448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 560640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 560640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 560640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 560640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +[triton-dejavu] First execution including JIT compilation took 8.529049634933472s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.0771217346191406s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.1195790767669678s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 11.513381719589233s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.836657762527466s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.298607587814331s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 510976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 510976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 659456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 675840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 807936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 824320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1104896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1121280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 30.039280891418457s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 16.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
+    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
+            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 16.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[triton-dejavu] First execution including JIT compilation took 7.3039727210998535s.
+[triton-dejavu] First execution including JIT compilation took 2.613384246826172s.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 724992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 757760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1021952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1054720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1318912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1351680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1615872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1648640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 2209792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 2242560, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[triton-dejavu] First execution including JIT compilation took 6.6451661586761475s.
+[triton-dejavu] First execution including JIT compilation took 2.0081593990325928s.
+[triton-dejavu] First execution including JIT compilation took 0.9920327663421631s.
+[triton-dejavu] First execution including JIT compilation took 7.377732992172241s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+[triton-dejavu] First execution including JIT compilation took 2.00347900390625s.
+[triton-dejavu] First execution including JIT compilation took 1.1749897003173828s.
+[triton-dejavu] First execution including JIT compilation took 7.304524898529053s.
+[triton-dejavu] First execution including JIT compilation took 1.9663889408111572s.
+[triton-dejavu] First execution including JIT compilation took 0.9999239444732666s.
+[triton-dejavu] First execution including JIT compilation took 7.501835346221924s.
+[triton-dejavu] First execution including JIT compilation took 2.0289182662963867s.
+[triton-dejavu] First execution including JIT compilation took 1.2674453258514404s.
+[triton-dejavu] First execution including JIT compilation took 7.546030521392822s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated.
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.1539206504821777s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.0315632820129395s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 7.50945520401001s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.068497657775879s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.1107735633850098s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 7.639242649078369s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.118880033493042s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.1335670948028564s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 7.542043685913086s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.1045868396759033s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.0928034782409668s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 8.052458047866821s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.2213785648345947s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.07881498336792s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 8.08164668083191s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.2511954307556152s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.1090149879455566s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 8.324692964553833s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.2822751998901367s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.1038963794708252s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 8.203450679779053s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.3637242317199707s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.166776418685913s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 8.533335447311401s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.439464569091797s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 1.1518568992614746s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
+    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
+            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+bench_cudagraph failed with out of resource: shared memory, Required: 304896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 304896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__
+    self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
+RuntimeError: Triton Error [CUDA]: out of memory
+
+[triton-dejavu] First execution including JIT compilation took 9.856246948242188s.
[... the same three failure patterns repeat verbatim for the remaining autotuner configurations: torch.OutOfMemoryError while allocating the 246.00 MiB benchmark cache (reported free GPU memory falls from 8.94 MiB to 4.94 MiB), triton.runtime.errors.OutOfResources with Required shared memory between 263168 and 1252352 bytes against the 232448-byte hardware limit, and occasional "Triton Error [CUDA]: out of memory" at kernel launch; interleaved "[triton-dejavu] First execution including JIT compilation took ...s." lines report per-variant compile times between roughly 1.2 s and 10.6 s ...]
+bench_cudagraph failed with out of resource: shared memory, Required: 1252352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1252352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1252352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1252352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+ +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +[triton-dejavu] First execution including JIT compilation took 9.443265199661255s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 3.8229305744171143s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 526336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 526336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 526336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 526336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 790528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 790528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 856064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 856064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 856064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 856064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1120256, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1120256, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1185792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1185792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1185792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1185792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1449984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1449984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1515520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1515520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1515520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1515520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1779712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1779712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1845248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1845248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1845248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1845248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 2439168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2439168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 2504704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2504704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 2504704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2504704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ + self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) +RuntimeError: Triton Error [CUDA]: out of memory + +[triton-dejavu] First execution including JIT compilation took 5.715268850326538s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.512333869934082s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +[triton-dejavu] First execution including JIT compilation took 5.526895999908447s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 2.5083181858062744s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. 
Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
+    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
+            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+
+[... the identical "bench_cudagraph failed with Triton Error [CUDA]: out of memory" traceback repeats for each subsequent autotuner configuration ...]
+
+bench_cudagraph failed with out of resource: shared memory, Required: 263424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[... the same OutOfResources traceback repeats for configurations requiring 279808, 362240, 378624, 263168, 329216, and 361984 bytes of shared memory, interleaved with further "Triton Error [CUDA]: out of memory" failures ...]
+
+bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 526848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 526848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 559616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 559616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 559616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 559616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 724480, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 724480, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 757248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 757248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 757248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 757248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 328704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 328704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 328704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 328704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 526336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 526336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 526336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 526336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 658432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 658432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 723968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 723968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 723968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 723968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 856064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 856064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 921600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 921600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 921600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 921600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1119232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1119232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1119232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1119232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1448960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1448960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1514496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1514496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1514496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1514496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 526336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 657408, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 921600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1052672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1316864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1447936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1712128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1843200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 2107392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 2238464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 2897920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 3028992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
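The repeated `OutOfResources` failures above are expected during an exhaustive sweep: candidate configurations whose shared-memory footprint exceeds the hardware limit cannot be launched at all, so the benchmark harness catches the exception, reports it, and moves on to the next candidate. A minimal sketch of that pattern in Python, assuming a hypothetical `bench_config` helper built on upstream Triton's `do_bench_cudagraph` (not the actual triton-dejavu implementation):

```python
import math

import triton.testing
# Triton raises OutOfResources at kernel launch when a config needs more shared
# memory than the GPU offers (e.g. Required: 262144 vs. Hardware limit: 232448).
from triton.runtime.errors import OutOfResources


def bench_config(kernel_call) -> float:
    """Time one autotune candidate; oversized configs are skipped, not fatal.

    `kernel_call` is assumed to be a zero-argument callable that launches the
    Triton kernel with one candidate config (illustrative helper only).
    """
    try:
        return triton.testing.do_bench_cudagraph(kernel_call)
    except OutOfResources as e:
        # Mirror the log above: report the failure and let the autotuner
        # discard this candidate by giving it an infinite benchmark time.
        print(f"bench_cudagraph failed with {e}")
        return math.inf
```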
+[triton-dejavu] added BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 16, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None for _chunk_scan_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default and key ('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')
+[2025-07-23 21:36:10] Triton autotuning for function _chunk_scan_fwd_kernel finished after 15278.82s; best config selected: BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 16, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None with benchmark time 0.014237518422305584; evaluated 2625 configurations;
+[triton-dejavu] ('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16') not in cache, starting to tune...
+[triton-dejavu] [2025-07-23 21:36:10] Started benchmarking of 2625 configurations... (use_bo: False, run: 0)
+[triton-dejavu] First execution including JIT compilation took 0.19137167930603027s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
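The CUDA OOM failures that follow come from a different resource than the shared-memory errors above: the benchmark allocates a ~256 MB scratch buffer (testing.py line 351, `torch.empty(int(256e6), ...)`) to flush the L2 cache between timed runs, and with roughly 78 GiB of the 80 GiB device already in use there is no room left for it. A small sketch of that allocation pattern with a graceful fallback; the buffer size matches the traceback below, but the retry-and-skip behaviour is an assumption for illustration, not the triton-dejavu code:

```python
import torch


def l2_flush_buffer(nbytes: int = int(256e6)) -> torch.Tensor | None:
    """Try to allocate the cache-flush scratch buffer used between timed runs.

    Returns None instead of raising when the device is already nearly full, so
    the caller can skip cache flushing (or the measurement) rather than abort
    the whole sweep.
    """
    try:
        return torch.empty(nbytes, dtype=torch.int8, device="cuda")
    except torch.OutOfMemoryError:
        # Matches the failure in the traceback below: only ~3 MiB free on an
        # ~80 GiB device, so a 246 MiB request cannot be served.
        torch.cuda.empty_cache()  # release cached allocator blocks, retry once
        try:
            return torch.empty(nbytes, dtype=torch.int8, device="cuda")
        except torch.OutOfMemoryError:
            return None
```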
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
+    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
+            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+
+[triton-dejavu] First execution including JIT compilation took 0.19248533248901367s.
+bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.21349525451660156s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.2810091972351074s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.22581052780151367s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.35887718200683594s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.2530679702758789s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.24747061729431152s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.24676847457885742s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.29694175720214844s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.25411462783813477s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.2558891773223877s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.25450968742370605s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.26735782623291016s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.29147815704345703s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.2686631679534912s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +[triton-dejavu] First execution including JIT compilation took 0.31691837310791016s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +[triton-dejavu] First execution including JIT compilation took 0.2369706630706787s. +bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph + cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. 
Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + 
^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
_init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = 
self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + 
ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + 
^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + 
self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return 
jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
[... identical "bench_cudagraph failed with Triton Error [CUDA]: out of memory" tracebacks repeated many times ...]
+
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + 
^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = 
self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + 
ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + 
^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + 
self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return 
jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + 
^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 
591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call 
last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", 
line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent 
call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton 
__call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+ +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton 
Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + 
self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
+RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + 
^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = 
self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in
run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 265216, Hardware limit: 232448. 
Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+ +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton 
Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton 
+
[... the identical "bench_cudagraph failed with Triton Error [CUDA]: out of memory" traceback repeats verbatim many more times for the remaining benchmarked configurations ...]
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py",
line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 280064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 280064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 280064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 280064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+ +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton 
Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 277504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 277504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 282624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 282624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 282624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 282624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 353280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 353280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 353280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 353280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 418816, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 418816, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 560128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 560128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 282624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 282624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 282624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 423936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 565248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 706560, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 989184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[... identical OutOfResources tracebacks for the remaining shared-memory failures omitted ...]
+
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+[... identical CUDA out-of-memory tracebacks for the remaining configurations omitted ...]
+
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + 
ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + 
^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + 
self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return 
jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda() +
^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = 
self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + 
ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + 
^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + 
self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return 
jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+
[... the identical "bench_cudagraph failed with Triton Error [CUDA]: out of memory" traceback repeats verbatim throughout this portion of the captured log ...]
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton 
Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + 
self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
+RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+
+[... the same "bench_cudagraph failed with Triton Error [CUDA]: out of memory" traceback repeats verbatim for each remaining autotuner configuration ...]
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 
591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call 
last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", 
line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent 
call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton 
Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + 
self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
+RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + 
^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = 
self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + 
ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + 
+bench_cudagraph failed with out of resource: shared memory, Required: 242688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 242688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 293888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 293888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 326656, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 326656, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton
Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 587776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 587776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 587776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 587776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 587776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 587776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+ +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton 
Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + 
self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
+RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+
+[... the same "bench_cudagraph failed with Triton Error [CUDA]: out of memory" traceback is repeated verbatim for many further autotuner configurations ...]
+bench_cudagraph failed with out of resource: shared memory, Required: 261632, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 261632, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
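The OutOfResources entries above end with Triton's own hint that smaller block sizes or fewer pipeline stages may help. Purely as an illustrative sketch (the matmul kernel, the names, and the config values below are assumptions, not the kernels being tuned in this log), this is how an autotune search space can be restricted so every candidate stays well under the reported ~232 KB shared-memory limit:

```python
# Hypothetical example: a plain Triton matmul whose autotune space only contains
# configs that fit the ~232448-byte shared-memory limit reported in the log.
# Rough per-config estimate for fp16 inputs:
#   smem ~= num_stages * (BLOCK_M*BLOCK_K + BLOCK_K*BLOCK_N) * 2 bytes
import torch
import triton
import triton.language as tl


@triton.autotune(
    configs=[
        # ~2 * (64*32 + 32*64) * 2 B = 16 KiB
        triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32}, num_stages=2, num_warps=4),
        # ~3 * (128*32 + 32*64) * 2 B = 36 KiB
        triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32}, num_stages=3, num_warps=8),
        # ~3 * (64*32 + 32*128) * 2 B = 36 KiB
        triton.Config({"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32}, num_stages=3, num_warps=8),
    ],
    key=["M", "N", "K"],
)
@triton.jit
def matmul_kernel(a_ptr, b_ptr, c_ptr, M, N, K,
                  stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn,
                  BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr):
    # One program computes one BLOCK_M x BLOCK_N tile of C.
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)
    a_ptrs = a_ptr + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak
    b_ptrs = b_ptr + offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for k in range(0, K, BLOCK_K):
        a = tl.load(a_ptrs, mask=(offs_m[:, None] < M) & (offs_k[None, :] + k < K), other=0.0)
        b = tl.load(b_ptrs, mask=(offs_k[:, None] + k < K) & (offs_n[None, :] < N), other=0.0)
        acc += tl.dot(a, b)
        a_ptrs += BLOCK_K * stride_ak
        b_ptrs += BLOCK_K * stride_bk
    c_ptrs = c_ptr + offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn
    tl.store(c_ptrs, acc, mask=(offs_m[:, None] < M) & (offs_n[None, :] < N))


def matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    M, K = a.shape
    _, N = b.shape
    c = torch.empty((M, N), device=a.device, dtype=torch.float32)
    grid = lambda meta: (triton.cdiv(M, meta["BLOCK_M"]), triton.cdiv(N, meta["BLOCK_N"]))
    matmul_kernel[grid](a, b, c, M, N, K,
                        a.stride(0), a.stride(1), b.stride(0), b.stride(1),
                        c.stride(0), c.stride(1))
    return c
```

Whether pruning the search space like this is appropriate for the kernels tuned in this run depends on their actual config spaces; it is shown here only to make the "Reducing block sizes or `num_stages` may help" hint concrete.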
+[... further verbatim repeats of the shared-memory and CUDA out-of-memory tracebacks omitted ...]
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in
_init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 289792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 289792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 364544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 364544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 373760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 373760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 373760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 373760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 439296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 439296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 523264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 523264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 523264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 523264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 588800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 588800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+ +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 598016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 598016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 598016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 598016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 598016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 598016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 747520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 747520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 747520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 747520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 747520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 747520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1046528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1046528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1046528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1046528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1046528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1046528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton 
Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + 
self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
+RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + 
^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = 
self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + 
ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + 
^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + 
self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return 
jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + 
^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 342016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 342016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 342016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 342016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module,
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + 
^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = 
self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + 
ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + 
^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + 
self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return 
jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
+
+bench_cudagraph failed with out of resource: shared memory, Required: 253952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 253952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 305152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 407552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+ File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + 
^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + 
self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return 
jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + 
^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
+
+bench_cudagraph failed with out of resource: shared memory, Required: 236544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+ File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+ fn()
+ File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+ return jit_first_time()
+ ^^^^^^^^^^^^^^^^
+ File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+ ret = self.call_lambda()
+ ^^^^^^^^^^^^^^^^^^
+ File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+ self.fn.run(
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+ kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+ ^^^^^^^^^^
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+ self._init_handles()
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+ raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 236544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
[... the same OutOfResources traceback appears twice more with "Required: 269312, Hardware limit: 232448" ...]
+ +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton 
Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 270336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 270336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 337920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 337920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 403456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 403456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 403456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 403456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 473088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 473088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 538624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 538624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 538624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 538624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton 
Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + 
self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
+
[... the same "bench_cudagraph failed with Triton Error [CUDA]: out of memory" traceback repeats verbatim many more times ...]
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + 
^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 300032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 300032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 300032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 300032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 351232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 351232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 400384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 400384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 400384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 400384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+ +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+               ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[identical OutOfResources traceback repeated for each of the following oversized configurations]
+bench_cudagraph failed with out of resource: shared memory, Required: 401408, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 499712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 499712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 501760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 600064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 600064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 702464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 800768, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 800768, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+               ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+
+[the same "bench_cudagraph failed with Triton Error [CUDA]: out of memory" message and identical traceback repeat for each remaining configuration]
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + 
^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + 
self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return 
jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 290304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 290304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+ +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 580608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+
+bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 663552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 829440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1161216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1325056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1325056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton
Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + 
self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
+RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + 
^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = 
self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + 
ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + 
^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + 
self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return 
jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + 
^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 
+
+bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 302080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 302080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton
Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 376832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 376832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 376832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 376832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 452608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 452608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 452608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 452608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 604160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 604160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 604160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 604160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+ +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton 
Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + 
self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
+RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + 
^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = 
self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + 
ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + 
^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + 
self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return 
jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + 
^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 293888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 293888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 334848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 334848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 334848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 334848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 501760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 501760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 501760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 501760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 587776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 587776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 669696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + 
self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return 
jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + 
^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[... the same OutOfResources traceback repeats for further configurations requiring 249856, 250880, 300032 (x2), 351232, and 400384 (x2) bytes of shared memory against the 232448-byte hardware limit ...]
+ +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 401408, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 401408, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 499712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 499712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 499712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 499712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 501760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 501760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 600064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 600064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 600064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 600064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 702464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 702464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 800768, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 800768, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 800768, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 800768, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+ +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton 
Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + 
self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
+RuntimeError: Triton Error [CUDA]: out of memory
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+
+bench_cudagraph failed with out of resource: shared memory, Required: 232960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 232960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 265728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 265728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module,
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + 
^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 266240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 266240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 332800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 332800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 398336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 398336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 398336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 398336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 465920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 465920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+ +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 266240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 266240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 532480, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 532480, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 663552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 665600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 796672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 931840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1062912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton 
Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + 
self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
+RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + 
^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = 
self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+                                                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+
[... the identical "bench_cudagraph failed with Triton Error [CUDA]: out of memory" traceback repeats for the further failing autotuner configurations ...]
+
+bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
[... the same OutOfResources traceback repeats for the remaining failing configurations; the reported shared-memory requirements, in order, are 247808, 248320, 297472 (x2), 347648, 396800 (x2) bytes, followed by six more "Triton Error [CUDA]: out of memory" failures, then 296960 (x2), 297984, 396288 (x2), 397312, 495616 (x2) and 496640 bytes, all against the 232448-byte hardware limit ...]
+
+bench_cudagraph failed with out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 695296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 695296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 793600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 793600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 793600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 793600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 593920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 593920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 593920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 593920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 794624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 794624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 991232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 991232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 991232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 991232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1189888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1189888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1189888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1189888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1390592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1390592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1587200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1587200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1587200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1587200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton 
Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + 
self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
+RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + 
^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = 
self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + 
ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + 
^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + 
self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return 
jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 282112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 282112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 282112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 282112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+ +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton 
Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 282624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 282624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 352256, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 352256, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 352256, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 352256, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 353280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 353280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 422912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 422912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 422912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 422912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 564224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 564224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 564224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 564224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 280576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 280576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 280576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 280576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 282624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 282624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 421888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 421888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 421888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 421888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 423936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 423936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 563200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 563200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 563200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 563200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 565248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 565248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 704512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 704512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 704512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 704512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 706560, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 706560, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 845824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 845824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 845824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 845824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 989184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 989184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1128448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1128448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1128448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1128448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton 
Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + 
self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
+RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + 
^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = 
self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + 
ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + 
^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + 
self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return 
jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 261632, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 261632, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 298496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 298496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+[log condensed: the identical "bench_cudagraph failed with out of resource: shared memory" traceback repeats for autotuner configs requiring 296960, 297984, 298496, 299008, 372736, 373760, 446464, 447488, 448512, 523264, 595968, 596992, 598016, 745472, 747520, 894976, 1046528, and 1193984 bytes of shared memory, all against the same 232448-byte hardware limit]
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+
+[log condensed: the identical CUDA out-of-memory traceback repeats for the remaining benchmarked configs; each fails while loading the compiled kernel binary in _init_handles; one representative instance follows]
+
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton 
Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + 
self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
+RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + 
^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = 
self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + 
ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + 
^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + 
self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return 
jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 290304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 290304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 580608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 580608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 663552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 663552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 829440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 829440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1161216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1161216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1325056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1325056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 1325056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1325056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton 
Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + 
self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
+
+[... identical "bench_cudagraph failed with Triton Error [CUDA]: out of memory" tracebacks repeat ...]
+
+bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 347648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 396800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+[... identical OutOfResources tracebacks omitted for each of the failures listed above ...]
+
+bench_cudagraph failed with out of resource: shared memory, Required: 396800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( 
+ File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 695296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 695296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 793600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 793600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 793600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 793600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error 
[CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 593920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 593920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 593920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 593920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 794624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 991232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 991232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1189888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1189888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1390592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1587200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1587200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
[each shared-memory failure above is followed by the same OutOfResources traceback as shown, differing only in the Required value]
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
+    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
+    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+RuntimeError: Triton Error [CUDA]: out of memory
+
[the "bench_cudagraph failed with Triton Error [CUDA]: out of memory" failure and this identical traceback repeat for the remaining configurations; the log continues below]
+
+bench_cudagraph failed with Triton Error [CUDA]: out of memory
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line
408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + 
ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 247552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 263936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+ +bench_cudagraph failed with out of resource: shared memory, Required: 263936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + 
^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 330240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 330240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 395776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 395776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 395776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 395776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 462336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 462336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 527872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. + +bench_cudagraph failed with out of resource: shared memory, Required: 527872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles + raise OutOfResources(self.metadata.shared, max_shared, "shared memory") +triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+ +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with Triton Error [CUDA]: out of memory +Traceback (most recent call last): + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph + fn() + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ + return jit_first_time() + ^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time + ret = self.call_lambda() + ^^^^^^^^^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call + self.fn.run( + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, + ^^^^^^^^^^ + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ + self._init_handles() + File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles + self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RuntimeError: Triton Error [CUDA]: out of memory + +bench_cudagraph failed with out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
+Traceback (most recent call last):
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
+    fn()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
+    return jit_first_time()
+           ^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
+    ret = self.call_lambda()
+          ^^^^^^^^^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
+    self.fn.run(
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
+    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
+    ^^^^^^^^^^
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
+    self._init_handles()
+  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
+    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
+triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+
+bench_cudagraph failed with out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 528384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 659456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 659456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 660480, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 791552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 791552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 924672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1055744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1055744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 526336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 526336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 528384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 790528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 790528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1054720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1054720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1056768, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1318912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1318912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1320960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1583104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1583104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 1849344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 2111488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
+bench_cudagraph failed with out of resource: shared memory, Required: 2111488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
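All of the rejected configurations above exceed the reported 232448-byte shared-memory budget, which is why the error suggests reducing the block sizes or `num_stages`. As a rough, illustrative sketch only (assuming a matmul-style kernel that stages two bf16 input tiles per pipeline stage; the helper name, the `HW_LIMIT` constant, and the example configurations below are hypothetical and not taken from this tuning run), the shared-memory footprint grows roughly as:

    # Rough shared-memory estimate for a tiled matmul-style Triton kernel
    # (illustrative assumption: two staged input tiles, A of shape M x K and
    # B of shape K x N, held in bf16 for each pipeline stage).
    def approx_smem_bytes(block_m, block_n, block_k, num_stages, elem_bytes=2):
        per_stage = (block_m * block_k + block_k * block_n) * elem_bytes
        return num_stages * per_stage

    HW_LIMIT = 232448  # hardware limit reported in the log above

    # Hypothetical configurations, only to show how quickly the budget is exceeded.
    for block_m, block_n, block_k, num_stages in [(64, 64, 64, 3), (128, 128, 64, 4), (256, 256, 64, 5)]:
        need = approx_smem_bytes(block_m, block_n, block_k, num_stages)
        status = "fits" if need <= HW_LIMIT else "OutOfResources"
        print(f"BM={block_m} BN={block_n} BK={block_k} stages={num_stages}: {need} bytes -> {status}")

Under this estimate, any configuration whose tile sizes and stage count push the footprint past the limit fails in the same way as the entries above.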
+
+[triton-dejavu] added BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 16, BLOCK_SIZE_K: 16, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None for _chunk_state_varlen_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default and key ('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16')
+[2025-07-24 03:00:55] Triton autotuning for function _chunk_state_varlen_kernel finished after 19485.39s; best config selected: BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 16, BLOCK_SIZE_K: 16, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None with benchmark time nan; evaluated 2625 configurations;
+ERROR 07-24 03:00:55 [dump_input.py:69] Dumping input data for V1 LLM engine (v0.1.dev7919+g84c7525) with config: model='/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf', speculative_config=None, tokenizer='/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=132096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"/home/zrlngl/.cache/vllm/torch_compile_cache/9bcd1b9f98","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"max_capture_size":512,"local_cache_dir":"/home/zrlngl/.cache/vllm/torch_compile_cache/9bcd1b9f98/rank_0_0/backbone"},
+ERROR 07-24 03:00:55 [dump_input.py:76] Dumping scheduler output for model execution: 
SchedulerOutput(scheduled_new_reqs=[NewRequestData(req_id=0,prompt_token_ids_len=64,mm_inputs=[],mm_hashes=[],mm_positions=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=True, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None, extra_args=None),block_ids=([1], [2], [3], [4], [5], [6], [7], [8], [9], [10]),num_computed_tokens=0,lora_request=None)], scheduled_cached_reqs=CachedRequestData(req_ids=[], resumed_from_preemption=[], new_token_ids=[], new_block_ids=[], num_computed_tokens=[]), num_scheduled_tokens={0: 64}, total_num_scheduled_tokens=64, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, num_common_prefix_blocks=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], finished_req_ids=[], free_encoder_input_ids=[], structured_output_request_ids={}, grammar_bitmask=null, kv_connector_metadata=null) +ERROR 07-24 03:00:55 [dump_input.py:79] Dumping scheduler stats: SchedulerStats(num_running_reqs=1, num_waiting_reqs=0, kv_cache_usage=0.009856630824372714, prefix_cache_stats=PrefixCacheStats(reset=False, requests=0, queries=0, hits=0), spec_decoding_stats=None, num_corrupted_reqs=0) +ERROR 07-24 03:00:55 [core.py:615] EngineCore encountered a fatal error. +ERROR 07-24 03:00:55 [core.py:615] Traceback (most recent call last): +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/engine/core.py", line 606, in run_engine_core +ERROR 07-24 03:00:55 [core.py:615] engine_core.run_busy_loop() +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/engine/core.py", line 633, in run_busy_loop +ERROR 07-24 03:00:55 [core.py:615] self._process_engine_step() +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/engine/core.py", line 658, in _process_engine_step +ERROR 07-24 03:00:55 [core.py:615] outputs, model_executed = self.step_fn() +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/engine/core.py", line 262, in step +ERROR 07-24 03:00:55 [core.py:615] model_output = self.execute_model(scheduler_output) +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/engine/core.py", line 248, in execute_model +ERROR 07-24 03:00:55 [core.py:615] raise err +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/engine/core.py", line 239, in execute_model +ERROR 07-24 03:00:55 [core.py:615] return self.model_executor.execute_model(scheduler_output) +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/executor/abstract.py", line 87, in execute_model +ERROR 07-24 03:00:55 [core.py:615] output = self.collective_rpc("execute_model", +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/executor/uniproc_executor.py", line 58, in collective_rpc +ERROR 07-24 03:00:55 [core.py:615] answer = 
run_method(self.driver_worker, method, args, kwargs) +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/utils/__init__.py", line 2990, in run_method +ERROR 07-24 03:00:55 [core.py:615] return func(*args, **kwargs) +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context +ERROR 07-24 03:00:55 [core.py:615] return func(*args, **kwargs) +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/worker/gpu_worker.py", line 327, in execute_model +ERROR 07-24 03:00:55 [core.py:615] output = self.model_runner.execute_model(scheduler_output, +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context +ERROR 07-24 03:00:55 [core.py:615] return func(*args, **kwargs) +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/worker/gpu_model_runner.py", line 1404, in execute_model +ERROR 07-24 03:00:55 [core.py:615] model_output = self.model( +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl +ERROR 07-24 03:00:55 [core.py:615] return self._call_impl(*args, **kwargs) +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl +ERROR 07-24 03:00:55 [core.py:615] return forward_call(*args, **kwargs) +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/model_executor/models/granitemoehybrid.py", line 634, in forward +ERROR 07-24 03:00:55 [core.py:615] hidden_states = self.model(input_ids, positions, mamba_cache_params, +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/compilation/decorators.py", line 246, in __call__ +ERROR 07-24 03:00:55 [core.py:615] model_output = self.forward(*args, **kwargs) +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/model_executor/models/granitemoehybrid.py", line 358, in forward +ERROR 07-24 03:00:55 [core.py:615] def forward( +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl +ERROR 07-24 03:00:55 [core.py:615] return self._call_impl(*args, **kwargs) +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl +ERROR 07-24 03:00:55 [core.py:615] return forward_call(*args, **kwargs) +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 838, in _fn +ERROR 07-24 03:00:55 [core.py:615] return fn(*args, **kwargs) +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/fx/graph_module.py", line 830, in call_wrapped +ERROR 07-24 03:00:55 [core.py:615] return self._wrapped_call(self, *args, **kwargs) +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/fx/graph_module.py", line 406, in __call__ +ERROR 07-24 03:00:55 [core.py:615] raise e +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/fx/graph_module.py", line 393, in __call__ +ERROR 07-24 03:00:55 [core.py:615] return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc] +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl +ERROR 07-24 03:00:55 [core.py:615] return self._call_impl(*args, **kwargs) +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl +ERROR 07-24 03:00:55 [core.py:615] return forward_call(*args, **kwargs) +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File ".82", line 220, in forward +ERROR 07-24 03:00:55 [core.py:615] submod_1 = self.submod_1(getitem, s0, getitem_1); getitem = submod_1 = None +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/fx/graph_module.py", line 830, in call_wrapped +ERROR 07-24 03:00:55 [core.py:615] return self._wrapped_call(self, *args, **kwargs) +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/fx/graph_module.py", line 406, in __call__ +ERROR 07-24 03:00:55 [core.py:615] raise e +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/fx/graph_module.py", line 393, in __call__ +ERROR 07-24 03:00:55 [core.py:615] return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc] +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl +ERROR 07-24 
03:00:55 [core.py:615] return self._call_impl(*args, **kwargs) +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl +ERROR 07-24 03:00:55 [core.py:615] return forward_call(*args, **kwargs) +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File ".2", line 5, in forward +ERROR 07-24 03:00:55 [core.py:615] mamba_mixer2 = torch.ops.vllm.mamba_mixer2(x_3, output, 'model.layers.0.mixer', None); x_3 = output = mamba_mixer2 = None +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/_ops.py", line 1158, in __call__ +ERROR 07-24 03:00:55 [core.py:615] return self._op(*args, **(kwargs or {})) +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/model_executor/layers/mamba/mamba_mixer2.py", line 749, in mamba_mixer2 +ERROR 07-24 03:00:55 [core.py:615] self.forward_cuda(hidden_states=hidden_states, +ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/model_executor/layers/mamba/mamba_mixer2.py", line 718, in forward_cuda +ERROR 07-24 03:00:55 [core.py:615] hidden_states = torch.vstack(ssd_output_list) +ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ERROR 07-24 03:00:55 [core.py:615] torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.66 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.95 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) +[(1, 64, 1), (1, 128, 1), (1, 512, 1), (1, 1024, 1), (1, 2048, 1), (1, 4096, 1)] +====== Measuring batch_size 1, input length 64, output length 1 ===== +VLLM_USE_V1=1 python vllm-triton-backend/vllm/benchmarks/benchmark_latency.py --model /net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf --input-len 64 --output-len 1 --batch-size 1 --output-json /home/zrlngl/watsonx/zrl-triton-results-and-notebooks/vllm_benchmarks_latency/-net-storage149-autofs-css22-nmg-models-cos-1bfc857-fmaas-integration-tests-models-granite-4_0-small-base-pipecleaner-hf/NVIDIA_H100_80GB_HBM3/tuning_ignore/exp_2025-07-23_1140//result_bs_1_il_64_ol_1.json --num-iters-warmup 3 --num-iters 3 --tensor-parallel 1 +benchmark command returned 256, stopping... 
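The sweep above aborts: the _chunk_state_varlen_kernel autotune pass finishes with a NaN best time, and the following model execution hits a CUDA out-of-memory error in torch.vstack(ssd_output_list) inside mamba_mixer2, so benchmark_latency.py exits non-zero (256) and the driver stops at the first (batch 1, input 64, output 1) point. The allocator report itself suggests PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to reduce fragmentation. Below is a minimal, hypothetical retry wrapper (not part of any patch in this series), assuming the driver builds the command string cmd as in scripts/bench_vllm_latency_range.py and launches it with os.system; the function name and retry_on_oom flag are illustrative only:

import os

def run_benchmark(cmd: str, retry_on_oom: bool = True) -> int:
    # First attempt: run the benchmark command exactly as constructed by the driver.
    rv = os.system(cmd)
    if rv != 0 and retry_on_oom:
        # Retry once with the allocator setting suggested by the OOM report above,
        # which can reduce fragmentation in the CUDA caching allocator.
        rv = os.system("PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True " + cmd)
    return rv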
From 5a31ca4062b2876ef953a6afc04de27672346761 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Thu, 24 Jul 2025 05:02:36 -0400 Subject: [PATCH 16/61] testing for longer sequences Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 8 ++++++-- scripts/bench_vllm_latency_range.py | 5 +++-- vllm | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default/cache.json index 58f89f93d..4f831cc77 100755 --- a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default/cache.json +++ b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default/cache.json @@ -1,16 +1,20 @@ { "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_state_passing:_state_passing_fwd_kernel)", - "total_bench_time_s": 275.2601103782654, + "total_bench_time_s": 607.0304324626923, "evaluated_configs": 168, "keys": [ "dim" ], "cache": { - "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32')": "BLOCK_SIZE: 512, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32')": "BLOCK_SIZE: 512, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32')": "BLOCK_SIZE: 512, num_warps: 2, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" }, "timings": { "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32')": [ 0.0030820679385215044 + ], + "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32')": [ + 0.13190822303295135 ] }, 
"timings_data": { diff --git a/scripts/bench_vllm_latency_range.py b/scripts/bench_vllm_latency_range.py index 4f3fe3a64..41ccc259b 100644 --- a/scripts/bench_vllm_latency_range.py +++ b/scripts/bench_vllm_latency_range.py @@ -50,7 +50,8 @@ def create_dir_if_not_exist(path, mode=0o777): selected_batch_sizes = [1] # [4, 16, 32] #,128] # selected_input_lengths = [500] # , 1000, 1500, 2000, 4000, 8000, 16000] # selected_output_lengths = [10, 100, 200, 400, 800, 1600, 3200, 6400, 12800] -selected_input_lengths = [64, 128, 512, 1024, 2048, 4096] +# selected_input_lengths = [64, 128, 512, 1024, 2048, 4096] +selected_input_lengths = [64, 128, 512, 1024, 2048, 4096, 8192, 31500] selected_output_lengths = [1] gpu_name = torch.cuda.get_device_name().replace(" ", "_").replace("/", "_") @@ -66,7 +67,7 @@ def create_dir_if_not_exist(path, mode=0o777): max_num_prompts = 1000 warmup_iterations = 3 -iterations = 3 +iterations = 5 timestamp_f = datetime.now().strftime("%Y-%m-%d_%H%M") diff --git a/vllm b/vllm index 1f6be7ff0..0c1383a5d 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit 1f6be7ff01b67e2551e3b6b9b7a8933dc0553bd8 +Subproject commit 0c1383a5d528f7d988fc5c10f398926be060d09c From fd1f12ea65c2145b3f68aee61bcb1e95f01d058b Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Thu, 24 Jul 2025 05:02:51 -0400 Subject: [PATCH 17/61] adding cache for baseline Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 26 ++++++++++++++++ .../default/cache.json | 25 +++++++++++++++ .../default/cache.json | 31 +++++++++++++++++++ .../default/cache.json | 26 ++++++++++++++++ .../default/cache.json | 30 ++++++++++++++++++ .../default/cache.json | 28 +++++++++++++++++ 6 files changed, 166 insertions(+) create mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json create mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_cumsum_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default/cache.json create mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-e1d63b4ce9f3ae5e2f38b68d3d8257474338c0a672ac24128b374d342459d7e1/default/cache.json create mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json create mode 
100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json create mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-68916ac9231d70c9dfa4b1081268470f5b25a8dbabb73d3818ba7e74c7fdc03c/default/cache.json diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json new file mode 100755 index 000000000..efcde2e45 --- /dev/null +++ b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json @@ -0,0 +1,26 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_bmm:_bmm_chunk_fwd_kernel)", + "total_bench_time_s": 4.903317928314209, + "evaluated_configs": 9, + "keys": [ + "chunk_size", + "K", + "IS_CAUSAL" + ], + "cache": { + "('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 32, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 5, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": [ + 0.007391999941319227 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": false + } +} \ No newline at end of file diff --git 
a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_cumsum_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_cumsum_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default/cache.json new file mode 100755 index 000000000..df74c1fe3 --- /dev/null +++ b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_cumsum_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default/cache.json @@ -0,0 +1,25 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_cumsum_fwd_kernel)", + "total_bench_time_s": 7.295067548751831, + "evaluated_configs": 7, + "keys": [ + "chunk_size", + "nheads" + ], + "cache": { + "('256', '128', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": "BLOCK_SIZE_H: 2, num_warps: 4, num_ctas: 1, num_stages: 3, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('256', '128', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": [ + 0.007071999832987785 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": false + } +} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-e1d63b4ce9f3ae5e2f38b68d3d8257474338c0a672ac24128b374d342459d7e1/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-e1d63b4ce9f3ae5e2f38b68d3d8257474338c0a672ac24128b374d342459d7e1/default/cache.json new file mode 100755 index 000000000..fb9768114 --- /dev/null +++ 
b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-e1d63b4ce9f3ae5e2f38b68d3d8257474338c0a672ac24128b374d342459d7e1/default/cache.json @@ -0,0 +1,31 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_scan:_chunk_scan_fwd_kernel)", + "total_bench_time_s": 22.759257316589355, + "evaluated_configs": 11, + "keys": [ + "chunk_size", + "hdim", + "dstate", + "IS_CAUSAL" + ], + "cache": { + "('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": "BLOCK_SIZE_M: 128, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32')": "BLOCK_SIZE_M: 32, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 5, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": [ + 0.014240000396966934 + ], + "('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32')": [ + 0.8048959970474243 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": false + } +} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json new file mode 100755 index 000000000..010c85ff2 --- /dev/null +++ 
b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json @@ -0,0 +1,26 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_fwd_kernel)", + "total_bench_time_s": 5.0212812423706055, + "evaluated_configs": 9, + "keys": [ + "hdim", + "dstate", + "chunk_size" + ], + "cache": { + "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 32, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 5, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": [ + 0.009247999638319016 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": false + } +} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json new file mode 100755 index 000000000..a81672d35 --- /dev/null +++ b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json @@ -0,0 +1,30 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_varlen_kernel)", + "total_bench_time_s": 17.040932178497314, + "evaluated_configs": 9, + "keys": [ + "hdim", + "dstate", + "chunk_size" + ], + "cache": { + "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 32, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 5, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16')": 
"BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16')": [ + 0.009184000082314014 + ], + "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16')": [ + 0.009184000082314014 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": false + } +} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-68916ac9231d70c9dfa4b1081268470f5b25a8dbabb73d3818ba7e74c7fdc03c/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-68916ac9231d70c9dfa4b1081268470f5b25a8dbabb73d3818ba7e74c7fdc03c/default/cache.json new file mode 100755 index 000000000..634fae182 --- /dev/null +++ b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-68916ac9231d70c9dfa4b1081268470f5b25a8dbabb73d3818ba7e74c7fdc03c/default/cache.json @@ -0,0 +1,28 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_state_passing:_state_passing_fwd_kernel)", + "total_bench_time_s": 6.713695287704468, + "evaluated_configs": 6, + "keys": [ + "dim" + ], + "cache": { + "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32')": "BLOCK_SIZE: 2048, num_warps: 4, num_ctas: 1, num_stages: 3, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32')": "BLOCK_SIZE: 512, num_warps: 4, num_ctas: 1, num_stages: 3, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32')": [ + 0.009664000011980534 + ], + "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32')": [ + 0.1367039978504181 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": false + } +} \ No newline at end of file From 600f5d8b0adc929bf8d8b161d97b7ac4bbcf5e5d Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Mon, 28 Jul 2025 06:05:07 -0400 Subject: [PATCH 18/61] measuring full cudagraphs, 
adding tuned fp8 moe Signed-off-by: Burkhard Ringlein --- ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 ++++++++++++++++++ .../default/cache.json | 8 + .../default/cache.json | 8 + .../default/cache.json | 8 + .../default/cache.json | 8 + .../default/cache.json | 8 + .../default/cache.json | 27 ++++ .../default/cache.json | 8 + .../default/cache.json | 8 + scripts/bench_vllm_latency_range.py | 3 + triton-dejavu | 2 +- vllm | 2 +- 13 files changed, 380 insertions(+), 2 deletions(-) create mode 100644 E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100644 E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json create mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json create mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json create mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json create mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json create mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-21ff5d19d1819793851ad7c7a60e8f4d7bd7bc84238d0302676bb9e213122e34/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json create mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-21ff5d19d1819793851ad7c7a60e8f4d7bd7bc84238d0302676bb9e213122e34/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json create mode 100755 
g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json create mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default/cache.json diff --git a/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 000000000..ac556d936 --- /dev/null +++ b/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + 
"num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json new file mode 100644 index 000000000..79fe4dbe7 --- /dev/null +++ b/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json 
b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json new file mode 100755 index 000000000..9808a0231 --- /dev/null +++ b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_bmm:_bmm_chunk_fwd_kernel)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json new file mode 100755 index 000000000..d35417f40 --- /dev/null +++ b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_scan:_chunk_scan_fwd_kernel)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json 
b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json new file mode 100755 index 000000000..0bdded18d --- /dev/null +++ b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_fwd_kernel)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json new file mode 100755 index 000000000..ee569dcb6 --- /dev/null +++ b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_varlen_kernel)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-21ff5d19d1819793851ad7c7a60e8f4d7bd7bc84238d0302676bb9e213122e34/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json 
b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-21ff5d19d1819793851ad7c7a60e8f4d7bd7bc84238d0302676bb9e213122e34/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json new file mode 100755 index 000000000..550944b2a --- /dev/null +++ b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-21ff5d19d1819793851ad7c7a60e8f4d7bd7bc84238d0302676bb9e213122e34/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.mamba_ssm:_selective_scan_update_kernel)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-21ff5d19d1819793851ad7c7a60e8f4d7bd7bc84238d0302676bb9e213122e34/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-21ff5d19d1819793851ad7c7a60e8f4d7bd7bc84238d0302676bb9e213122e34/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json new file mode 100755 index 000000000..c7fb158cf --- /dev/null +++ b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-21ff5d19d1819793851ad7c7a60e8f4d7bd7bc84238d0302676bb9e213122e34/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json @@ -0,0 +1,27 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.mamba_ssm:_selective_scan_update_kernel)", + "total_bench_time_s": 154.3796603679657, + "evaluated_configs": 75, + "keys": [ + "dstate", + "BLOCK_SIZE_DSTATE", + "dim", + "nheads_ngroups_ratio" + ], + "cache": { + "('128', '128', '64', '128')": "BLOCK_SIZE_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('128', '128', '64', '128')": [ + 1.7349423170089722 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } +} \ No newline at end of file diff --git 
a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json new file mode 100755 index 000000000..550944b2a --- /dev/null +++ b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.mamba_ssm:_selective_scan_update_kernel)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default/cache.json new file mode 100755 index 000000000..a62237df1 --- /dev/null +++ b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_state_passing:_state_passing_fwd_kernel)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/scripts/bench_vllm_latency_range.py b/scripts/bench_vllm_latency_range.py index 41ccc259b..9f675234d 100644 --- a/scripts/bench_vllm_latency_range.py +++ b/scripts/bench_vllm_latency_range.py @@ -113,6 +113,9 @@ def 
create_dir_if_not_exist(path, mode=0o777): f"--num-iters-warmup {warmup_iterations} " f"--num-iters {iterations} " f"--tensor-parallel {tp} " + f"--enable-chunked-prefill " + f"--max-num-batched-tokens 16384 " + f"-O.full_cuda_graph=true" ) print(cmd) rv = os.system(cmd) diff --git a/triton-dejavu b/triton-dejavu index c2555ce1a..2c5616e18 160000 --- a/triton-dejavu +++ b/triton-dejavu @@ -1 +1 @@ -Subproject commit c2555ce1a61d2288007366b2dcef1203ed1f26ee +Subproject commit 2c5616e1850ed54b24be2f31017f9f4d0fb74727 diff --git a/vllm b/vllm index 0c1383a5d..bb8849aad 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit 0c1383a5d528f7d988fc5c10f398926be060d09c +Subproject commit bb8849aadc3ab23bfe481fa6003e062cb2cc649c From 408b462c65c7aab086eacfd7bd62db1ad4541396 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Mon, 28 Jul 2025 13:24:08 -0400 Subject: [PATCH 19/61] itl tuning Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 27 +++++++++++++++++++ scripts/bench_vllm_latency_range.py | 15 +++++------ vllm | 2 +- 3 files changed, 35 insertions(+), 9 deletions(-) create mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-12ef9e4125a78d954cad03c22e7b626a75d6e484131a7b8653f8b7d84d9f78f3/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-12ef9e4125a78d954cad03c22e7b626a75d6e484131a7b8653f8b7d84d9f78f3/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-12ef9e4125a78d954cad03c22e7b626a75d6e484131a7b8653f8b7d84d9f78f3/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json new file mode 100755 index 000000000..466963e92 --- /dev/null +++ b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-12ef9e4125a78d954cad03c22e7b626a75d6e484131a7b8653f8b7d84d9f78f3/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json @@ -0,0 +1,27 @@ +{ + "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.mamba_ssm:_selective_scan_update_kernel)", + "total_bench_time_s": 201.7921507358551, + "evaluated_configs": 105, + "keys": [ + "dstate", + "BLOCK_SIZE_DSTATE", + "dim", + "nheads_ngroups_ratio" + ], + "cache": { + "('128', '128', '64', '128', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32')": "BLOCK_SIZE_M: 64, num_warps: 2, num_ctas: 1, num_stages: 8, 
num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('128', '128', '64', '128', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32')": [ + 0.05485290288925171 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } +} \ No newline at end of file diff --git a/scripts/bench_vllm_latency_range.py b/scripts/bench_vllm_latency_range.py index 9f675234d..58ff41106 100644 --- a/scripts/bench_vllm_latency_range.py +++ b/scripts/bench_vllm_latency_range.py @@ -47,12 +47,15 @@ def create_dir_if_not_exist(path, mode=0o777): print(f"Usage: {sys.argv[0]} ") exit(-1) -selected_batch_sizes = [1] # [4, 16, 32] #,128] +# selected_batch_sizes = [1] # [4, 16, 32] #,128] # selected_input_lengths = [500] # , 1000, 1500, 2000, 4000, 8000, 16000] # selected_output_lengths = [10, 100, 200, 400, 800, 1600, 3200, 6400, 12800] # selected_input_lengths = [64, 128, 512, 1024, 2048, 4096] -selected_input_lengths = [64, 128, 512, 1024, 2048, 4096, 8192, 31500] -selected_output_lengths = [1] +# selected_input_lengths = [64, 128, 512, 1024, 2048, 4096, 8192, 31500] +# selected_output_lengths = [1] +selected_batch_sizes = [1, 2, 4, 8, 16, 32, 64] +selected_input_lengths = [128] +selected_output_lengths = [32, 128, 256] gpu_name = torch.cuda.get_device_name().replace(" ", "_").replace("/", "_") @@ -62,10 +65,6 @@ def create_dir_if_not_exist(path, mode=0o777): testcase_name = sys.argv[3] result_path = os.path.abspath(sys.argv[4]) -# max_rounds = 128 -max_rounds = 64 -max_num_prompts = 1000 - warmup_iterations = 3 iterations = 5 @@ -115,7 +114,7 @@ def create_dir_if_not_exist(path, mode=0o777): f"--tensor-parallel {tp} " f"--enable-chunked-prefill " f"--max-num-batched-tokens 16384 " - f"-O.full_cuda_graph=true" + # f"-O.full_cuda_graph=true" ) print(cmd) rv = os.system(cmd) diff --git a/vllm b/vllm index bb8849aad..aef0bfd6e 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit bb8849aadc3ab23bfe481fa6003e062cb2cc649c +Subproject commit aef0bfd6ec4602b057e9c968347ef100dc533ae3 From 8b0994f91467fd33550c1ee71b6efb7c2724e8eb Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Tue, 29 Jul 2025 05:25:39 -0400 Subject: [PATCH 20/61] improving benchmark latency script Signed-off-by: Burkhard Ringlein --- scripts/bench_vllm_latency_range.py | 60 +++++++++++++++++------------ 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/scripts/bench_vllm_latency_range.py b/scripts/bench_vllm_latency_range.py index 58ff41106..e729e5cba 100644 --- a/scripts/bench_vllm_latency_range.py +++ b/scripts/bench_vllm_latency_range.py @@ -20,7 +20,28 @@ import sys import torch from datetime import datetime -from itertools import zip_longest, repeat, chain +from itertools import zip_longest, repeat, chain, product + + +# ================= SETUP + +# selected_batch_sizes = [1] # [4, 16, 32] #,128] +# selected_input_lengths = [500] # , 1000, 1500, 2000, 4000, 8000, 16000] +# selected_output_lengths = [10, 100, 200, 400, 800, 1600, 3200, 6400, 12800] +# selected_input_lengths = [64, 128, 512, 1024, 2048, 4096] +# selected_input_lengths = [64, 128, 512, 1024, 2048, 4096, 8192, 31500] +# selected_output_lengths = [1] +selected_batch_sizes = [1, 2, 4, 8, 16, 32, 64] +selected_input_lengths = [128] +selected_output_lengths = [32, 128, 256] + +# use_cross_product = False 
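+# Summary of the flag handled further below: when True, every
+# (batch_size, input_len, output_len) combination is benchmarked via
+# itertools.product; when False, the three lists are zipped position-wise,
+# with shorter lists padded by repeating their last entry.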
+use_cross_product = True + +warmup_iterations = 3 +iterations = 5 + +# ================= def create_dir_if_not_exist_recursive(path, mode=0o777): @@ -47,15 +68,6 @@ def create_dir_if_not_exist(path, mode=0o777): print(f"Usage: {sys.argv[0]} ") exit(-1) -# selected_batch_sizes = [1] # [4, 16, 32] #,128] -# selected_input_lengths = [500] # , 1000, 1500, 2000, 4000, 8000, 16000] -# selected_output_lengths = [10, 100, 200, 400, 800, 1600, 3200, 6400, 12800] -# selected_input_lengths = [64, 128, 512, 1024, 2048, 4096] -# selected_input_lengths = [64, 128, 512, 1024, 2048, 4096, 8192, 31500] -# selected_output_lengths = [1] -selected_batch_sizes = [1, 2, 4, 8, 16, 32, 64] -selected_input_lengths = [128] -selected_output_lengths = [32, 128, 256] gpu_name = torch.cuda.get_device_name().replace(" ", "_").replace("/", "_") @@ -65,8 +77,6 @@ def create_dir_if_not_exist(path, mode=0o777): testcase_name = sys.argv[3] result_path = os.path.abspath(sys.argv[4]) -warmup_iterations = 3 -iterations = 5 timestamp_f = datetime.now().strftime("%Y-%m-%d_%H%M") @@ -83,19 +93,21 @@ def create_dir_if_not_exist(path, mode=0o777): print(f"can't find benchmark script benchmark_latency.py") exit(-1) -max_length = max(len(selected_batch_sizes), len(selected_input_lengths), len(selected_output_lengths)) -zipped_lists = list( - zip_longest( - chain(selected_batch_sizes, - repeat(selected_batch_sizes[-1], times=max_length-len(selected_batch_sizes))), - chain(selected_input_lengths, - repeat(selected_input_lengths[-1], times=max_length-len(selected_input_lengths))), - chain(selected_output_lengths, - repeat(selected_output_lengths[-1], times=max_length-len(selected_output_lengths))), - fillvalue=None, +if use_cross_product: + zipped_lists = list(product(selected_batch_sizes, selected_input_lengths, selected_output_lengths)) +else: + max_length = max(len(selected_batch_sizes), len(selected_input_lengths), len(selected_output_lengths)) + zipped_lists = list( + zip_longest( + chain(selected_batch_sizes, + repeat(selected_batch_sizes[-1], times=max_length-len(selected_batch_sizes))), + chain(selected_input_lengths, + repeat(selected_input_lengths[-1], times=max_length-len(selected_input_lengths))), + chain(selected_output_lengths, + repeat(selected_output_lengths[-1], times=max_length-len(selected_output_lengths))), + fillvalue=None, + ) ) -) - print(zipped_lists) From 023a5a5205a00c0e1803463a202aa7f668b38813 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Tue, 29 Jul 2025 06:10:21 -0400 Subject: [PATCH 21/61] adding simple heuristics to unit tests Signed-off-by: Burkhard Ringlein --- .gitignore | 2 + .../ibm_triton_lib/kernels/__init__.py | 1 + .../default/cache.json | 8 + .../default/cache.json | 8 + .../default/cache.json | 35 + .../triton_unified_attention_simple.py | 756 ++++++++++++++++++ scripts/bench_vllm_latency_range.py | 4 +- scripts/benchmark.py | 8 +- scripts/callers/__init__.py | 1 + scripts/callers/unified_triton.py | 69 +- scripts/requirements.txt | 2 + scripts/setups/prefix_tune_2d.conf | 4 +- 12 files changed, 893 insertions(+), 5 deletions(-) create mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json create mode 100644 ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_simple.py create mode 100644 scripts/requirements.txt diff --git a/.gitignore b/.gitignore index ccc868826..a30a2dc84 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,5 @@ ShareGPT_V3_unfiltered_cleaned_split.json .vscode/settings.json +ibm-triton-lib/ibm_triton_lib.egg-info/ + diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py b/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py index 2a97f6023..9b28ea4d2 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py @@ -67,6 +67,7 @@ def ConfigSpace( ) from .triton_unified_attention import unified_attention +from .triton_unified_attention_simple import unified_attention as unified_attention_simple from .mamba_ssm import selective_state_update diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json new file mode 100755 index 000000000..c2b3452bf --- /dev/null +++ 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.mamba_ssm:_selective_scan_update_kernel)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json new file mode 100755 index 000000000..2540ac5c3 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json new file mode 100755 index 000000000..a83cef97e --- /dev/null +++ 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json @@ -0,0 +1,35 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", + "total_bench_time_s": 363.07500290870667, + "evaluated_configs": 540, + "keys": [ + "MAX_SEQ_Q", + "MAX_SEQ_K", + "AVG_SEQ_Q", + "AVG_SEQ_K", + "num_query_heads", + "num_queries_per_kv", + "BLOCK_SIZE", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "SLIDING_WINDOW", + "stride_k_cache_3", + "stride_v_cache_3" + ], + "cache": { + "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 256, BLOCK_M: 512, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 4.2064047534040583e-07 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_simple.py b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_simple.py new file mode 100644 index 000000000..cbbc52250 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_simple.py @@ -0,0 +1,756 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Authors: +# - Burkhard Ringlein +# - Jan van Lunteren +# - Chih-Chieh Yang +# - Thomas Parnell + +import torch + +from vllm.logger import init_logger +from vllm.triton_utils import tl, triton + +logger = init_logger(__name__) + + +@triton.jit +def cdiv_fn(x, y): + return (x + y - 1) // y + + +@triton.jit +def apply_softcap(S, x): + Sdiv = S / x + p1 = tl.exp(Sdiv) + p2 = tl.exp(-Sdiv) + return x * (p1 - p2) / (p1 + p2) + + +@triton.jit +def find_seq_idx(query_start_len_ptr, target_idx, num_seqs, + BLOCK_Q: tl.constexpr, use_q_block_mode: tl.constexpr): + left: tl.int32 = 0 + right = num_seqs + while left < right: + mid = (left + right) // 2 + val = tl.load(query_start_len_ptr + mid) + mid_val = val // BLOCK_Q + mid if use_q_block_mode else val + + if mid_val <= target_idx: + left = mid + 1 + else: + right = mid + + return left - 1 + + +@triton.jit +def kernel_unified_attention_2d( + output_ptr, # [num_tokens, num_query_heads, head_size] + query_ptr, # [num_tokens, num_query_heads, head_size] + key_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + value_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_query_heads] + scale, # float32 + k_scale, # float32 + v_scale, # float32 + softcap, # float32 + num_query_heads: tl.constexpr, # int + num_queries_per_kv: tl.constexpr, # int + block_table_stride: tl.int64, # int + query_stride_0: tl.int64, # int + query_stride_1: tl.int64, # int, should be equal to head_size + output_stride_0: tl.int64, # int + output_stride_1: tl.int64, # int, should be equal to head_size 
+ BLOCK_SIZE: tl.constexpr, # int + HEAD_SIZE: tl.constexpr, # int + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + USE_ALIBI_SLOPES: tl.constexpr, # bool + USE_SOFTCAP: tl.constexpr, # bool + SLIDING_WINDOW: tl.constexpr, # int + stride_k_cache_0: tl.int64, # int + stride_k_cache_1: tl.int64, # int + stride_k_cache_2: tl.int64, # int + stride_k_cache_3: tl.constexpr, # int + stride_v_cache_0: tl.int64, # int + stride_v_cache_1: tl.int64, # int + stride_v_cache_2: tl.int64, # int + stride_v_cache_3: tl.constexpr, # int + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + num_seqs: tl.int32, + BLOCK_M: tl.constexpr, # int + BLOCK_N: tl.constexpr, # int +): + q_block_global_idx = tl.program_id(0) + kv_head_idx = tl.program_id(1) + + seq_idx = find_seq_idx(query_start_len_ptr, q_block_global_idx, num_seqs, + BLOCK_Q, True) + + q_block_start_idx = tl.load(query_start_len_ptr + + seq_idx) // BLOCK_Q + seq_idx + + q_block_local_idx = q_block_global_idx - q_block_start_idx + + cur_batch_in_all_start_index = tl.load(query_start_len_ptr + seq_idx) + cur_batch_in_all_stop_index = tl.load(query_start_len_ptr + seq_idx + 1) + + cur_batch_query_len = cur_batch_in_all_stop_index \ + - cur_batch_in_all_start_index + + if q_block_local_idx * BLOCK_Q >= cur_batch_query_len: + return + + offs_m = tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_SIZE_PADDED) + query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv + + query_offset_0 = cur_batch_in_all_start_index + query_pos + query_offset_1 = kv_head_idx * num_queries_per_kv + \ + offs_m % num_queries_per_kv + query_offset = (query_offset_0[:, None] * query_stride_0 + + query_offset_1[:, None] * query_stride_1 + offs_d[None, :]) + + dim_mask = tl.where(offs_d < HEAD_SIZE, 1, 0).to(tl.int1) + query_mask_0 = tl.where(query_pos < cur_batch_query_len, 1, 0).to(tl.int1) + query_mask_1 = tl.where(query_offset_1 < num_query_heads, 1, 0).to(tl.int1) + + # Q : (BLOCK_M, HEAD_SIZE_PADDED) + Q = tl.load( + query_ptr + query_offset, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + other=0.0, + ) + + block_table_offset = seq_idx * block_table_stride + + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + L = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) + + # sequence len for this particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # context length for this particular sequences + context_len = seq_len - cur_batch_query_len + + # alibi slope for this head + if USE_ALIBI_SLOPES: + alibi_slope = tl.load(alibi_slopes_ptr + query_offset_1, + mask=query_mask_1, + other=0.0) + + # compute the length of the longest sequence prefix spanned by any + # query token in the current q_block (q_block_local_idx) + max_seq_prefix_len = context_len + q_block_local_idx * BLOCK_Q + ( + BLOCK_M - 1) // num_queries_per_kv + 1 + + # adjust for potential padding in the last q_block by considering the + # actual sequence length + max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len) + + offs_n = tl.arange(0, BLOCK_N) + + # iterate through tiles (below the mask) + # The loop iterates only until the longest sequence. Due to causal + # masking, blocks beyond this prefix can be skipped. 
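+    # Worked example with illustrative values (not from the original code):
+    # for context_len=1000, q_block_local_idx=2, BLOCK_Q=8, BLOCK_M=16,
+    # num_queries_per_kv=2 and seq_len=4096, the longest prefix any query row
+    # of this block attends to is 1000 + 2*8 + (16-1)//2 + 1 = 1024 keys, so
+    # the loop below runs ceil(1024 / BLOCK_N) iterations instead of
+    # ceil(4096 / BLOCK_N).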
+ for start_n in range(0, max_seq_prefix_len, BLOCK_N): + + start_n = tl.multiple_of(start_n, BLOCK_N) + + physical_block_idx = tl.load(block_tables_ptr + block_table_offset + + (start_n + offs_n) // BLOCK_SIZE, + mask=(start_n + offs_n) < seq_len, + other=0) + + v_offset = (physical_block_idx[:, None] * stride_v_cache_0 + + kv_head_idx * stride_v_cache_2 + + offs_d[None, :] * stride_v_cache_3 + + (offs_n[:, None] % BLOCK_SIZE) * stride_v_cache_1) + + k_offset = (physical_block_idx[None, :] * stride_k_cache_0 + + kv_head_idx * stride_k_cache_2 + + offs_d[:, None] * stride_k_cache_3 + + (offs_n[None, :] % BLOCK_SIZE) * stride_k_cache_1) + + # K : (HEAD_SIZE_PADDED, BLOCK_N) + K_load = tl.load(key_cache_ptr + k_offset, + mask=dim_mask[:, None], + other=0.0) + + if K_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + K = K_load + else: + K = (K_load.to(tl.float32) * tl.load(k_scale)).to(Q.dtype) + else: + K = K_load + + # V : (BLOCK_N, HEAD_SIZE_PADDED) + V_load = tl.load(value_cache_ptr + v_offset, + mask=dim_mask[None, :], + other=0.0) + + if V_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + V = V_load + else: + V = (V_load.to(tl.float32) * tl.load(v_scale)).to(Q.dtype) + else: + V = V_load + + seq_offset = start_n + tl.arange(0, BLOCK_N) + + # seq_mask: (BLOCK_M, BLOCK_N) + seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 + + # S : (BLOCK_M, BLOCK_N) + S = tl.zeros(shape=(BLOCK_M, BLOCK_N), dtype=tl.float32) + + S += scale * tl.dot(Q, K) + + if USE_SOFTCAP: + S = apply_softcap(S, softcap) + + S = tl.where(query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, + S, float("-inf")) + + if SLIDING_WINDOW > 0: + S = tl.where((context_len + query_pos[:, None] - seq_offset) + < SLIDING_WINDOW, S, float("-inf")) + + if USE_ALIBI_SLOPES: + S += alibi_slope[:, None] * (seq_offset - context_len) + + # compute running maximum + # m_j : (BLOCK_M,) + m_j = tl.maximum(M, tl.max(S, axis=1)) + # For sliding window there's a chance the max is -inf due to masking of + # the entire row. 
In this case we need to set m_j 0 to avoid NaN + m_j = tl.where(m_j > float("-inf"), m_j, 0.0) + + # P : (BLOCK_M, BLOCK_N) + P = tl.exp(S - m_j[:, None]) + + # l_j : (BLOCK_M,) + l_j = tl.sum(P, axis=1) + + # alpha : (BLOCK_M, ) + alpha = tl.exp(M - m_j) + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc = acc * alpha[:, None] + + # update constants + L = L * alpha + l_j + M = m_j + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc += tl.dot(P.to(V.dtype), V) + + # epilogue + acc = acc / L[:, None] + + output_offset = (query_offset_0[:, None] * output_stride_0 + + query_offset_1[:, None] * output_stride_1 + + offs_d[None, :]) + + tl.store( + output_ptr + output_offset, + acc, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + ) + + +@triton.jit +def kernel_unified_attention_3d( + segm_output_ptr, + # [num_tokens, num_query_heads, num_segments, head_size] + segm_max_ptr, # [num_tokens, num_query_heads, num_segments] + segm_expsum_ptr, # [num_tokens, num_query_heads, num_segments] + query_ptr, # [num_tokens, num_query_heads, head_size] + key_cache_ptr, # [num_blks, num_kv_heads, head_size // x, blk_size, x] + value_cache_ptr, # [num_blks, num_kv_heads, head_size, blk_size] + block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_query_heads] + scale, # float32 + k_scale, # float32 + v_scale, # float32 + softcap, # float32 + num_query_heads: tl.constexpr, # int + num_queries_per_kv: tl.constexpr, # int + block_table_stride: tl.int64, # int + query_stride_0: tl.int64, # int + query_stride_1: tl.int64, # int, should be equal to head_size + BLOCK_SIZE: tl.constexpr, # int + HEAD_SIZE: tl.constexpr, # int + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + USE_ALIBI_SLOPES: tl.constexpr, # bool + USE_SOFTCAP: tl.constexpr, # bool + SLIDING_WINDOW: tl.constexpr, # int + stride_k_cache_0: tl.int64, # int + stride_k_cache_1: tl.int64, # int + stride_k_cache_2: tl.int64, # int + stride_k_cache_3: tl.constexpr, # int + stride_v_cache_0: tl.int64, # int + stride_v_cache_1: tl.int64, # int + stride_v_cache_2: tl.int64, # int + stride_v_cache_3: tl.constexpr, # int + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + num_seqs: tl.int32, + BLOCK_M: tl.constexpr, # int + NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int +): + q_block_global_idx = tl.program_id(0) + kv_head_idx = tl.program_id(1) + segm_idx = tl.program_id(2) + + seq_idx = find_seq_idx(query_start_len_ptr, q_block_global_idx, num_seqs, + BLOCK_Q, True) + + q_block_start_idx = tl.load(query_start_len_ptr + + seq_idx) // BLOCK_Q + seq_idx + + q_block_local_idx = q_block_global_idx - q_block_start_idx + + cur_batch_in_all_start_index = tl.load(query_start_len_ptr + seq_idx) + cur_batch_in_all_stop_index = tl.load(query_start_len_ptr + seq_idx + 1) + + cur_batch_query_len = cur_batch_in_all_stop_index \ + - cur_batch_in_all_start_index + + if q_block_local_idx * BLOCK_Q >= cur_batch_query_len: + return + + # sequence len for this particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # number of segments for this particular sequence + num_segments = NUM_SEGMENTS_PER_SEQ + blocks_per_segment = cdiv_fn(seq_len, num_segments * BLOCK_SIZE) + + if segm_idx * blocks_per_segment * BLOCK_SIZE >= seq_len: + return + + offs_m = tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_SIZE_PADDED) + + query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv + + query_offset_0 = cur_batch_in_all_start_index + query_pos + query_offset_1 = 
kv_head_idx * num_queries_per_kv + \ + offs_m % num_queries_per_kv + + query_offset = (query_offset_0[:, None] * query_stride_0 + + query_offset_1[:, None] * query_stride_1 + offs_d[None, :]) + + dim_mask = tl.where(offs_d < HEAD_SIZE, 1, 0).to(tl.int1) + query_mask_0 = tl.where(query_pos < cur_batch_query_len, 1, 0).to(tl.int1) + query_mask_1 = tl.where(query_offset_1 < num_query_heads, 1, 0).to(tl.int1) + + # Q : (BLOCK_M, HEAD_SIZE_PADDED) + Q = tl.load( + query_ptr + query_offset, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + other=0.0, + ) + + block_table_offset = seq_idx * block_table_stride + + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + L = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) + + # context length for this particular sequences + context_len = seq_len - cur_batch_query_len + + # alibi slope for this head + if USE_ALIBI_SLOPES: + alibi_slope = tl.load(alibi_slopes_ptr + query_offset_1, + mask=query_mask_1, + other=0.0) + + num_blocks = cdiv_fn(seq_len, BLOCK_SIZE) + + # iterate through tiles within current segment + for j in range( + segm_idx * blocks_per_segment, + min((segm_idx + 1) * blocks_per_segment, num_blocks), + ): + physical_block_idx = tl.load(block_tables_ptr + block_table_offset + j) + + offs_n = tl.arange(0, BLOCK_SIZE) + + v_offset = (physical_block_idx * stride_v_cache_0 + + kv_head_idx * stride_v_cache_2 + + offs_d[None, :] * stride_v_cache_3 + + offs_n[:, None] * stride_v_cache_1) + + k_offset = (physical_block_idx * stride_k_cache_0 + + kv_head_idx * stride_k_cache_2 + + offs_d[:, None] * stride_k_cache_3 + + offs_n[None, :] * stride_k_cache_1) + + # K : (HEAD_SIZE, BLOCK_SIZE) + K_load = tl.load(key_cache_ptr + k_offset, + mask=dim_mask[:, None], + other=0.0) + + if K_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + K = K_load + else: + K = (K_load.to(tl.float32) * tl.load(k_scale)).to(Q.dtype) + else: + K = K_load + + # V : (BLOCK_SIZE, HEAD_SIZE) + V_load = tl.load(value_cache_ptr + v_offset, + mask=dim_mask[None, :], + other=0.0) + + if V_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + V = V_load + else: + V = (V_load.to(tl.float32) * tl.load(v_scale)).to(Q.dtype) + else: + V = V_load + + seq_offset = j * BLOCK_SIZE + offs_n + + seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 + + # S : (BLOCK_M, BLOCK_SIZE) + S = tl.zeros(shape=(BLOCK_M, BLOCK_SIZE), dtype=tl.float32) + + S += scale * tl.dot(Q, K) + + if USE_SOFTCAP: + S = apply_softcap(S, softcap) + + S = tl.where(query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, + S, float("-inf")) + + if SLIDING_WINDOW > 0: + S = tl.where((context_len + query_pos[:, None] - seq_offset) + < SLIDING_WINDOW, S, float("-inf")) + + if USE_ALIBI_SLOPES: + S += alibi_slope[:, None] * (seq_offset - context_len) + + # compute running maximum + # m_j : (BLOCK_M,) + m_j = tl.maximum(M, tl.max(S, axis=1)) + # For sliding window there's a chance the max is -inf due to masking of + # the entire row. 
In this case we need to set m_j 0 to avoid NaN + m_j = tl.where(m_j > float("-inf"), m_j, 0.0) + + # P : (BLOCK_M, BLOCK_SIZE,) + P = tl.exp(S - m_j[:, None]) + + # l_j : (BLOCK_M,) + l_j = tl.sum(P, axis=1) + + # alpha : (BLOCK_M, ) + alpha = tl.exp(M - m_j) + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc = acc * alpha[:, None] + + # update constants + L = L * alpha + l_j + M = m_j + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc += tl.dot(P.to(V.dtype), V) + + segm_output_offset = ( + query_offset_0[:, None].to(tl.int64) * + (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + query_offset_1[:, None] * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + segm_idx * HEAD_SIZE_PADDED + tl.arange(0, HEAD_SIZE_PADDED)[None, :]) + tl.store( + segm_output_ptr + segm_output_offset, + acc, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + ) + segm_offset = (query_offset_0.to(tl.int64) * + (num_query_heads * NUM_SEGMENTS_PER_SEQ) + + query_offset_1 * NUM_SEGMENTS_PER_SEQ + segm_idx) + tl.store(segm_max_ptr + segm_offset, M, mask=query_mask_0 & query_mask_1) + tl.store(segm_expsum_ptr + segm_offset, + L, + mask=query_mask_0 & query_mask_1) + + +@triton.jit +def reduce_segments( + output_ptr, # [num_tokens, num_query_heads, head_size] + segm_output_ptr, + #[num_tokens, num_query_heads, max_num_segments, head_size] + segm_max_ptr, # [num_tokens, num_query_heads, max_num_segments] + segm_expsum_ptr, # [num_tokens, num_query_heads, max_num_segments] + seq_lens_ptr, # [num_seqs] + num_seqs, # int + num_query_heads: tl.constexpr, # int + output_stride_0: tl.int64, # int + output_stride_1: tl.int64, # int, should be equal to head_size + block_table_stride: tl.int64, # int + BLOCK_SIZE: tl.constexpr, # int + HEAD_SIZE: tl.constexpr, # int, must be power of 2 + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int +): + query_token_idx = tl.program_id(0) + query_head_idx = tl.program_id(1) + + seq_idx = find_seq_idx(query_start_len_ptr, query_token_idx, num_seqs, + BLOCK_Q, False) + + # sequence len for this particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # number of segments for this particular sequence + num_segments = NUM_SEGMENTS_PER_SEQ + blocks_per_segment = cdiv_fn(seq_len, num_segments * BLOCK_SIZE) + + # create masks for subsequent loads + act_num_segments = cdiv_fn(seq_len, blocks_per_segment * BLOCK_SIZE) + segm_mask = tl.arange(0, NUM_SEGMENTS_PER_SEQ) < tl.full( + [NUM_SEGMENTS_PER_SEQ], act_num_segments, dtype=tl.int32) + dim_mask = tl.where(tl.arange(0, HEAD_SIZE_PADDED) < HEAD_SIZE, 1, + 0).to(tl.int1) + + # load segment maxima + segm_offset = (query_token_idx.to(tl.int64) * + (num_query_heads * NUM_SEGMENTS_PER_SEQ) + + query_head_idx * NUM_SEGMENTS_PER_SEQ + + tl.arange(0, NUM_SEGMENTS_PER_SEQ)) + segm_max = tl.load(segm_max_ptr + segm_offset, + mask=segm_mask, + other=float("-inf")) + overall_max = tl.max(segm_max) + + # load and rescale segment exp sums + segm_expsum = tl.load(segm_expsum_ptr + segm_offset, + mask=segm_mask, + other=0.0) + segm_expsum = segm_expsum * tl.exp(segm_max - overall_max) + overall_expsum = tl.sum(segm_expsum) + + # load, rescale, and add segment attention outputs + segm_output_offset = ( + query_token_idx.to(tl.int64) * + (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + query_head_idx * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + tl.arange(0, NUM_SEGMENTS_PER_SEQ)[:, None] * 
HEAD_SIZE_PADDED + + tl.arange(0, HEAD_SIZE_PADDED)[None, :]) + segm_output = tl.load( + segm_output_ptr + segm_output_offset, + mask=segm_mask[:, None] & dim_mask[None, :], + other=0.0, + ) + segm_output *= tl.exp(segm_max - overall_max)[:, None] + acc_sum = tl.sum(segm_output, axis=0) + # safely divide by overall_expsum, returning 0.0 if overall_expsum is 0 + acc = tl.where(overall_expsum == 0.0, 0.0, acc_sum / overall_expsum) + + # write result + output_offset = (query_token_idx * output_stride_0 + + query_head_idx * output_stride_1 + + tl.arange(0, HEAD_SIZE_PADDED)) + tl.store(output_ptr + output_offset, acc, mask=dim_mask) + + +def unified_attention( + q, + k, + v, + out, + cu_seqlens_q, + max_seqlen_q, + seqused_k, + max_seqlen_k, + softmax_scale, + causal, + window_size, + block_table, + softcap, + q_descale, + k_descale, + v_descale, + alibi_slopes=None, +): + assert causal, "Only causal attention is supported" + assert q_descale is None, "Q scales not supported" + + block_size = v.shape[1] + assert q.element_size() >= 2 or block_size >= 32, \ + "Block size must be at least 32 for fp8" + + use_alibi_slopes = alibi_slopes is not None + + block_size = v.shape[1] + num_seqs = len(seqused_k) + num_query_heads = q.shape[1] + num_kv_heads = k.shape[2] + num_queries_per_kv = num_query_heads // num_kv_heads + head_size = q.shape[2] + + # balancing the blocksizes for short and long prompts + BLOCK_M = 16 + BLOCK_N = block_size + BLOCK_Q = BLOCK_M // num_queries_per_kv + + # Ideally we would launch with kernel with: + # \sum_i[ceil(query_len[i] / BLOCK_Q)] blocks. + # However, it is slow to realize the query_lens on cpu. + # Instead we use upper-bound: + # \sum_i[ceil(query_len[i] / BLOCK_Q)] + # <= \sum_i[floor(query_len[i] / BLOCK_Q) + 1] + # = \sum_i[floor(query_len[i] / BLOCK_Q)] + num_seqs + # <= floor(\sum_i(query_len[i]) / BLOCK_Q) + num_seqs + # = floor(q.shape[0] / BLOCK_Q) + num_seqs + total_num_q_blocks = q.shape[0] // BLOCK_Q + num_seqs + + # if batch contains a prefill + grid = lambda META: (q.shape[0] // (META[ + 'BLOCK_M'] // num_queries_per_kv) + num_seqs, num_kv_heads) + + kernel_unified_attention_2d[grid]( + output_ptr=out, + query_ptr=q, + key_cache_ptr=k, + value_cache_ptr=v, + block_tables_ptr=block_table, + seq_lens_ptr=seqused_k, + alibi_slopes_ptr=alibi_slopes, + scale=softmax_scale, + k_scale=k_descale, + v_scale=v_descale, + softcap=softcap, + num_query_heads=num_query_heads, + num_queries_per_kv=num_queries_per_kv, + block_table_stride=block_table.stride(0), + query_stride_0=q.stride(0), + query_stride_1=q.stride(1), + output_stride_0=out.stride(0), + output_stride_1=out.stride(1), + BLOCK_SIZE=block_size, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + USE_ALIBI_SLOPES=use_alibi_slopes, + USE_SOFTCAP=(softcap > 0), + SLIDING_WINDOW=(1 + window_size[0]), + stride_k_cache_0=k.stride(0), + stride_k_cache_1=k.stride(1), + stride_k_cache_2=k.stride(2), + stride_k_cache_3=k.stride(3), + stride_v_cache_0=v.stride(0), + stride_v_cache_1=v.stride(1), + stride_v_cache_2=v.stride(2), + stride_v_cache_3=v.stride(3), + query_start_len_ptr=cu_seqlens_q, + BLOCK_Q=BLOCK_Q, + num_seqs=num_seqs, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + num_stages=4, + ) + # else: + # # for initial version, NUM_SEGMENTS = 16 is chosen as a default + # # value that showed good performance in tests + # NUM_SEGMENTS = 16 + + # segm_output = torch.empty( + # q.shape[0], + # num_query_heads, + # NUM_SEGMENTS, + # triton.next_power_of_2(head_size), + # dtype=torch.float32, 
+ # device=q.device, + # ) + # segm_max = torch.empty( + # q.shape[0], + # num_query_heads, + # NUM_SEGMENTS, + # dtype=torch.float32, + # device=q.device, + # ) + # segm_expsum = torch.empty( + # q.shape[0], + # num_query_heads, + # NUM_SEGMENTS, + # dtype=torch.float32, + # device=q.device, + # ) + + # kernel_unified_attention_3d[( + # total_num_q_blocks, num_kv_heads, NUM_SEGMENTS)]( + # segm_output_ptr=segm_output, + # segm_max_ptr=segm_max, + # segm_expsum_ptr=segm_expsum, + # query_ptr=q, + # key_cache_ptr=k, + # value_cache_ptr=v, + # block_tables_ptr=block_table, + # seq_lens_ptr=seqused_k, + # alibi_slopes_ptr=alibi_slopes, + # scale=softmax_scale, + # k_scale=k_descale, + # v_scale=v_descale, + # softcap=softcap, + # num_query_heads=num_query_heads, + # num_queries_per_kv=num_queries_per_kv, + # block_table_stride=block_table.stride(0), + # query_stride_0=q.stride(0), + # query_stride_1=q.stride(1), + # BLOCK_SIZE=block_size, + # HEAD_SIZE=head_size, + # HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + # USE_ALIBI_SLOPES=use_alibi_slopes, + # USE_SOFTCAP=(softcap > 0), + # SLIDING_WINDOW=(1 + window_size[0]), + # stride_k_cache_0=k.stride(0), + # stride_k_cache_1=k.stride(1), + # stride_k_cache_2=k.stride(2), + # stride_k_cache_3=k.stride(3), + # stride_v_cache_0=v.stride(0), + # stride_v_cache_1=v.stride(1), + # stride_v_cache_2=v.stride(2), + # stride_v_cache_3=v.stride(3), + # query_start_len_ptr=cu_seqlens_q, + # BLOCK_Q=BLOCK_Q, + # num_seqs=num_seqs, + # BLOCK_M=BLOCK_M, + # NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, + # ) + + # reduce_segments[(q.shape[0], num_query_heads)]( + # output_ptr=out, + # segm_output_ptr=segm_output, + # segm_max_ptr=segm_max, + # segm_expsum_ptr=segm_expsum, + # seq_lens_ptr=seqused_k, + # num_seqs=num_seqs, + # num_query_heads=num_query_heads, + # output_stride_0=out.stride(0), + # output_stride_1=out.stride(1), + # block_table_stride=block_table.stride(0), + # BLOCK_SIZE=block_size, + # HEAD_SIZE=head_size, + # HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + # query_start_len_ptr=cu_seqlens_q, + # BLOCK_Q=BLOCK_Q, + # NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, + # ) diff --git a/scripts/bench_vllm_latency_range.py b/scripts/bench_vllm_latency_range.py index e729e5cba..f711316f7 100644 --- a/scripts/bench_vllm_latency_range.py +++ b/scripts/bench_vllm_latency_range.py @@ -110,7 +110,7 @@ def create_dir_if_not_exist(path, mode=0o777): ) print(zipped_lists) - +start_time = datetime.now() for bs, il, ol in zipped_lists: print( f"====== Measuring batch_size {bs}, input length {il}, output length {ol} =====" @@ -134,5 +134,7 @@ def create_dir_if_not_exist(path, mode=0o777): print(f"benchmark command returned {rv}, stopping...") exit(rv) +end_time = datetime.now() print(f"results stored in: {result_dir}") os.system(f"ls -alh {result_dir}") +print(f"Benchmark time: {end_time-start_time}") diff --git a/scripts/benchmark.py b/scripts/benchmark.py index b4619bb94..618dfed9b 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -70,6 +70,7 @@ class Implementation(Enum): PYTORCH_NATIVE = 13 TRITON_TUNED = 14 TRITON_FALLBACK = 15 + UNF_TRITON_2D_SIMPLE = 16 class BenchmarkMode(Enum): @@ -1041,14 +1042,15 @@ def test_prefix_vllm_v1_attention( Implementation.TRITON_2D, Implementation.UNF_TRITON_3D, Implementation.UNF_TRITON_2D, + Implementation.UNF_TRITON_2D_SIMPLE, Implementation.UNF_TRITON_AUTO, ]: pytest.skip() # TODO: Error: "Offset increment outside graph capture" # for triton and flash_attn - if benchmark_mode == BenchmarkMode.CUDA_GRAPHS: - 
pytest.skip("not supported") + # if benchmark_mode == BenchmarkMode.CUDA_GRAPHS: + # pytest.skip("not supported") # TODO # RTOL = 0 @@ -1293,6 +1295,8 @@ def test_prefix_vllm_v1_attention( from callers import UnifiedTriton3dAttentionCaller as Caller elif implementation == Implementation.UNF_TRITON_2D: from callers import UnifiedTriton2dAttentionCaller as Caller + elif implementation == Implementation.UNF_TRITON_2D_SIMPLE: + from callers import SimpleUnifiedTriton2dAttentionCaller as Caller elif implementation == Implementation.UNF_TRITON_AUTO: from callers import UnifiedTritonAutoAttentionCaller as Caller diff --git a/scripts/callers/__init__.py b/scripts/callers/__init__.py index 022e3a2e5..212944442 100644 --- a/scripts/callers/__init__.py +++ b/scripts/callers/__init__.py @@ -56,4 +56,5 @@ UnifiedTriton2dAttentionCaller, UnifiedTriton3dAttentionCaller, UnifiedTritonAutoAttentionCaller, + SimpleUnifiedTriton2dAttentionCaller, ) diff --git a/scripts/callers/unified_triton.py b/scripts/callers/unified_triton.py index c1f9b2c81..9dd00fe57 100644 --- a/scripts/callers/unified_triton.py +++ b/scripts/callers/unified_triton.py @@ -17,7 +17,7 @@ import torch -from ibm_triton_lib.kernels import unified_attention +from ibm_triton_lib.kernels import unified_attention, unified_attention_simple from .base import PrefixPrefillCaller @@ -126,6 +126,73 @@ def make_call_func( ) +class SimpleUnifiedTriton2dAttentionCaller(PrefixPrefillCaller): + @staticmethod + def make_call_func( + output, + query, + key_cache, + value_cache, + key, + value, + block_tables, + seq_lens, + ctx_lens, + query_lens, + start_loc, + seq_start_loc, + softmax_scale, + # kv_cache_dtype, # unused + force_selection=2, + ): + """ + query: shape = [num_tokens, num_heads, head_size] + key: shape = [num_tokens, num_kv_heads, head_size] + value: shape = [num_tokens, num_kv_heads, head_size] + k_cache = [num_blocks, block_size, num_kv_heads, head_size] + v_cache = [num_blocks, block_size, num_kv_heads, head_size] + Returns: + shape = [num_tokens, num_heads, head_size] + """ + assert force_selection == 2, "simple unified kernel is only applicable to 2d" + + max_query_len = query_lens.max() + max_seqlen = seq_lens.max() + + avg_seqlen_q = query_lens.to(torch.float).mean() + avg_seqlen_k = seq_lens.to(torch.float).mean() + + def call_and_process_output(): + # k must have shape (num_blocks, page_block_size, num_heads_k, head_size) + return unified_attention_simple( + q=query, + k=key_cache, + v=value_cache, + out=output, + cu_seqlens_q=start_loc, + max_seqlen_q=max_query_len, + seqused_k=seq_lens, + max_seqlen_k=max_seqlen, + softmax_scale=softmax_scale, + causal=True, + window_size=(-1, -1), + block_table=block_tables, + softcap=0, + q_descale=None, + k_descale=None, # TODO? + v_descale=None, # TODO? 
+ alibi_slopes=None, + # avg_seqlen_q=avg_seqlen_q, + # avg_seqlen_k=avg_seqlen_k, + ) + + return call_and_process_output + + @staticmethod + def requires_allocated_output() -> bool: + return True + + class UnifiedTritonAutoAttentionCaller(UnifiedTriton3dAttentionCaller): @staticmethod def make_call_func( diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 000000000..840e5980b --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1,2 @@ +llnl-hatchet==2025.1.0 +pytest==8.4.1 diff --git a/scripts/setups/prefix_tune_2d.conf b/scripts/setups/prefix_tune_2d.conf index 5987fd28e..6e587247e 100644 --- a/scripts/setups/prefix_tune_2d.conf +++ b/scripts/setups/prefix_tune_2d.conf @@ -25,10 +25,12 @@ PROMPT_PATTERNS = [[1.0], [0.1, 0.4, 0.5, 1.0, 0.2]] MAX_VALUES = [1.0] BENCHMARK_MODES = ["CUDA_EVENTS"] -IMPLEMENTATION_UT = ["UNF_TRITON_2D"] +# IMPLEMENTATION_UT = ["UNF_TRITON_2D"] +IMPLEMENTATION_UT = ["UNF_TRITON_2D_SIMPLE"] # IMPLEMENTATION_UT = ["FLASH_ATTN", "UNF_TRITON_2D"] # TRITON_BACKEND_DEBUG = 1 # STORE_TEST_RESULT_PATH=/results +STORE_TEST_RESULT_PATH=/home/zrlngl/watsonx/zrl-triton-results-and-notebooks/micro_benchmarks/raw_data/ # TEST_ALLOW_INCORRECT = 1 From 322eae97e1d122da1515787a948aecd961c70347 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Tue, 29 Jul 2025 10:21:16 +0000 Subject: [PATCH 22/61] making path relative Signed-off-by: Burkhard Ringlein --- scripts/benchmark.py | 2 +- scripts/setups/prefix_tune_2d.conf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 618dfed9b..8faf2fd91 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -2041,7 +2041,7 @@ def write_df_and_chmod(df, filename, mode=0o777): timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") if STORE_TEST_RESULT_PATH is not None: - gpu_path = os.path.join(STORE_TEST_RESULT_PATH, gpu_name) + gpu_path = os.path.join(os.path.abspath(STORE_TEST_RESULT_PATH), gpu_name) gloabl_pd_file_prefix = os.path.join(gpu_path, timestamp) create_dir_if_not_exist_recursive(gloabl_pd_file_prefix) else: diff --git a/scripts/setups/prefix_tune_2d.conf b/scripts/setups/prefix_tune_2d.conf index 6e587247e..c0b3d03f6 100644 --- a/scripts/setups/prefix_tune_2d.conf +++ b/scripts/setups/prefix_tune_2d.conf @@ -31,6 +31,6 @@ IMPLEMENTATION_UT = ["UNF_TRITON_2D_SIMPLE"] # TRITON_BACKEND_DEBUG = 1 # STORE_TEST_RESULT_PATH=/results -STORE_TEST_RESULT_PATH=/home/zrlngl/watsonx/zrl-triton-results-and-notebooks/micro_benchmarks/raw_data/ +STORE_TEST_RESULT_PATH=./zrl-triton-results-and-notebooks/micro_benchmarks/raw_data/ # TEST_ALLOW_INCORRECT = 1 From bbce518c8a42d1bdd266ef35f631a31515cfd0cd Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Tue, 29 Jul 2025 13:58:46 +0000 Subject: [PATCH 23/61] also looking at decode Signed-off-by: Burkhard Ringlein --- scripts/setups/prefix_tune_2d.conf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/setups/prefix_tune_2d.conf b/scripts/setups/prefix_tune_2d.conf index c0b3d03f6..1e89c3397 100644 --- a/scripts/setups/prefix_tune_2d.conf +++ b/scripts/setups/prefix_tune_2d.conf @@ -5,8 +5,8 @@ NUM_HEADS = [[32, 8]] SEQUENCE_LENGTHS = [16, 32, 64, 128, 512, 1024, 2048, 4096] # SEQUENCE_LENGTHS = [64] -# PREFIX_PREFILL_SHARE_OF_DECODE = [0.0, 0.5, 1.0] -PREFIX_PREFILL_SHARE_OF_DECODE = [0.0, 0.5] +PREFIX_PREFILL_SHARE_OF_DECODE = [0.0, 0.5, 1.0] +# PREFIX_PREFILL_SHARE_OF_DECODE = [0.0, 0.5] # PREFIX_PREFILL_SHARE_OF_DECODE = [0.5] 
PREFIX_PREFILL_SHARE_OF_PARTIAL_PREFILL = [0.0, 0.5] # PREFIX_PREFILL_SHARE_OF_PARTIAL_PREFILL = [0.5] From cc9327d73b7e536b628bf512fdbb46bcda244bec Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Tue, 29 Jul 2025 11:18:59 -0400 Subject: [PATCH 24/61] Adding kernels with flexible tiles Signed-off-by: Burkhard Ringlein Co-authored-by: Jan Van Lunteren --- .../ibm_triton_lib/kernels/__init__.py | 1 + .../kernels/triton_unified_newtiles.py | 772 ++++++++++++++++++ scripts/benchmark.py | 12 + scripts/callers/__init__.py | 5 + scripts/callers/unified_triton_newtiles.py | 164 ++++ scripts/setups/prefix_tune_2d.conf | 4 +- 6 files changed, 957 insertions(+), 1 deletion(-) create mode 100644 ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_newtiles.py create mode 100644 scripts/callers/unified_triton_newtiles.py diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py b/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py index 9b28ea4d2..1471acd33 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py @@ -68,6 +68,7 @@ def ConfigSpace( from .triton_unified_attention import unified_attention from .triton_unified_attention_simple import unified_attention as unified_attention_simple +from .triton_unified_newtiles import unified_attention as unified_attention_newtiles from .mamba_ssm import selective_state_update diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_newtiles.py b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_newtiles.py new file mode 100644 index 000000000..30d80ad34 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_newtiles.py @@ -0,0 +1,772 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Authors: +# - Burkhard Ringlein +# - Jan van Lunteren +# - Chih-Chieh Yang +# - Thomas Parnell + +import torch + +from vllm.logger import init_logger +from vllm.triton_utils import tl, triton + +logger = init_logger(__name__) + + +@triton.jit +def cdiv_fn(x, y): + return (x + y - 1) // y + + +@triton.jit +def apply_softcap(S, x): + Sdiv = S / x + p1 = tl.exp(Sdiv) + p2 = tl.exp(-Sdiv) + return x * (p1 - p2) / (p1 + p2) + + +@triton.jit +def find_seq_idx(query_start_len_ptr, target_idx, num_seqs, + BLOCK_Q: tl.constexpr, use_q_block_mode: tl.constexpr): + left: tl.int32 = 0 + right = num_seqs + while left < right: + mid = (left + right) // 2 + val = tl.load(query_start_len_ptr + mid) + mid_val = val // BLOCK_Q + mid if use_q_block_mode else val + + if mid_val <= target_idx: + left = mid + 1 + else: + right = mid + + return left - 1 + + +@triton.jit +def kernel_unified_attention_2d( + output_ptr, # [num_tokens, num_query_heads, head_size] + query_ptr, # [num_tokens, num_query_heads, head_size] + key_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + value_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_query_heads] + scale, # float32 + k_scale, # float32 + v_scale, # float32 + softcap, # float32 + num_query_heads: tl.constexpr, # int + num_queries_per_kv: tl.constexpr, # int + block_table_stride: tl.int64, # int + query_stride_0: tl.int64, # int + query_stride_1: tl.int64, # int, should be equal to head_size + output_stride_0: tl.int64, # int + output_stride_1: tl.int64, # int, should be equal to head_size + BLOCK_SIZE: tl.constexpr, # int + TILE_SIZE: tl.constexpr, # int must be 
power of 2 + HEAD_SIZE: tl.constexpr, # int + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + USE_ALIBI_SLOPES: tl.constexpr, # bool + USE_SOFTCAP: tl.constexpr, # bool + SLIDING_WINDOW: tl.constexpr, # int + stride_k_cache_0: tl.int64, # int + stride_k_cache_1: tl.int64, # int + stride_k_cache_2: tl.int64, # int + stride_k_cache_3: tl.constexpr, # int + stride_v_cache_0: tl.int64, # int + stride_v_cache_1: tl.int64, # int + stride_v_cache_2: tl.int64, # int + stride_v_cache_3: tl.constexpr, # int + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + num_seqs: tl.int32, + BLOCK_M: tl.constexpr, # int +): + q_block_global_idx = tl.program_id(0) + kv_head_idx = tl.program_id(1) + + seq_idx = find_seq_idx(query_start_len_ptr, q_block_global_idx, num_seqs, + BLOCK_Q, True) + + q_block_start_idx = tl.load(query_start_len_ptr + + seq_idx) // BLOCK_Q + seq_idx + + q_block_local_idx = q_block_global_idx - q_block_start_idx + + cur_batch_in_all_start_index = tl.load(query_start_len_ptr + seq_idx) + cur_batch_in_all_stop_index = tl.load(query_start_len_ptr + seq_idx + 1) + + cur_batch_query_len = cur_batch_in_all_stop_index \ + - cur_batch_in_all_start_index + + if q_block_local_idx * BLOCK_Q >= cur_batch_query_len: + return + + offs_m = tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_SIZE_PADDED) + offs_t = tl.arange(0, TILE_SIZE) + query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv + + query_offset_0 = cur_batch_in_all_start_index + query_pos + query_offset_1 = kv_head_idx * num_queries_per_kv + \ + offs_m % num_queries_per_kv + query_offset = (query_offset_0[:, None] * query_stride_0 + + query_offset_1[:, None] * query_stride_1 + offs_d[None, :]) + + dim_mask = tl.where(offs_d < HEAD_SIZE, 1, 0).to(tl.int1) + query_mask_0 = tl.where(query_pos < cur_batch_query_len, 1, 0).to(tl.int1) + query_mask_1 = tl.where(query_offset_1 < num_query_heads, 1, 0).to(tl.int1) + + # Q : (BLOCK_M, HEAD_SIZE_PADDED) + Q = tl.load( + query_ptr + query_offset, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + other=0.0, + ) + + block_table_offset = seq_idx * block_table_stride + + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + L = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) + + # sequence len for this particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # context length for this particular sequences + context_len = seq_len - cur_batch_query_len + + # alibi slope for this head + if USE_ALIBI_SLOPES: + alibi_slope = tl.load(alibi_slopes_ptr + query_offset_1, + mask=query_mask_1, + other=0.0) + + # compute the length of the longest sequence prefix spanned by any + # query token in the current q_block (q_block_local_idx) + max_seq_prefix_len = context_len + q_block_local_idx * BLOCK_Q + ( + BLOCK_M - 1) // num_queries_per_kv + 1 + + # adjust for potential padding in the last q_block by considering the + # actual sequence length + max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len) + + # calculate the number of tiles that need to be processed to + # cover the longest sequence prefix (due to causal masking, tiles beyond + # this prefix can be skipped) + num_tiles = cdiv_fn(max_seq_prefix_len, TILE_SIZE) + + # iterate through tiles + for j in range(0, num_tiles): + seq_offset = j * TILE_SIZE + offs_t + tile_mask = seq_offset < max_seq_prefix_len + + physical_block_idx = tl.load(block_tables_ptr + block_table_offset + + seq_offset // 
BLOCK_SIZE).to(tl.int64) + + v_offset = (physical_block_idx[:, None] * stride_v_cache_0 + + kv_head_idx * stride_v_cache_2 + + offs_d[None, :] * stride_v_cache_3 + + (seq_offset % BLOCK_SIZE)[:, None] * stride_v_cache_1) + + k_offset = (physical_block_idx[None, :] * stride_k_cache_0 + + kv_head_idx * stride_k_cache_2 + + offs_d[:, None] * stride_k_cache_3 + + (seq_offset % BLOCK_SIZE)[None, :] * stride_k_cache_1) + + # K : (HEAD_SIZE, TILE_SIZE) + K_load = tl.load(key_cache_ptr + k_offset, + mask=dim_mask[:, None] & tile_mask[None, :], + other=0.0) + + if K_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + K = K_load + else: + K = (K_load.to(tl.float32) * tl.load(k_scale)).to(Q.dtype) + else: + K = K_load + + # V : (TILE_SIZE, HEAD_SIZE) + V_load = tl.load(value_cache_ptr + v_offset, + mask=dim_mask[None, :] & tile_mask[:, None], + other=0.0) + + if V_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + V = V_load + else: + V = (V_load.to(tl.float32) * tl.load(v_scale)).to(Q.dtype) + else: + V = V_load + + seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 + + # S : (BLOCK_M, TILE_SIZE) + S = tl.zeros(shape=(BLOCK_M, TILE_SIZE), dtype=tl.float32) + + S += scale * tl.dot(Q, K) + + if USE_SOFTCAP: + S = apply_softcap(S, softcap) + + S = tl.where(query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, + S, float("-inf")) + + if SLIDING_WINDOW > 0: + S = tl.where((context_len + query_pos[:, None] - seq_offset) + < SLIDING_WINDOW, S, float("-inf")) + + if USE_ALIBI_SLOPES: + S += alibi_slope[:, None] * (seq_offset - context_len) + + # compute running maximum + # m_j : (BLOCK_M,) + m_j = tl.maximum(M, tl.max(S, axis=1)) + + # For sliding window there's a chance the max is -inf due to masking of + # the entire row. In this case we need to set m_j 0 to avoid NaN + m_j = tl.where(m_j > float("-inf"), m_j, 0.0) + + # P : (BLOCK_M, TILE_SIZE) + P = tl.exp(S - m_j[:, None]) + + # l_j : (BLOCK_M,) + l_j = tl.sum(P, axis=1) + + # alpha : (BLOCK_M, ) + alpha = tl.exp(M - m_j) + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc = acc * alpha[:, None] + + # update constants + L = L * alpha + l_j + M = m_j + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc += tl.dot(P.to(V.dtype), V) + + # epilogue + acc = acc / L[:, None] + + output_offset = (query_offset_0[:, None] * output_stride_0 + + query_offset_1[:, None] * output_stride_1 + + offs_d[None, :]) + + tl.store( + output_ptr + output_offset, + acc, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + ) + + +@triton.jit +def kernel_unified_attention_3d( + segm_output_ptr, + # [num_tokens, num_query_heads, num_segments, head_size] + segm_max_ptr, # [num_tokens, num_query_heads, num_segments] + segm_expsum_ptr, # [num_tokens, num_query_heads, num_segments] + query_ptr, # [num_tokens, num_query_heads, head_size] + key_cache_ptr, # [num_blks, num_kv_heads, head_size // x, blk_size, x] + value_cache_ptr, # [num_blks, num_kv_heads, head_size, blk_size] + block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_query_heads] + scale, # float32 + k_scale, # float32 + v_scale, # float32 + softcap, # float32 + num_query_heads: tl.constexpr, # int + num_queries_per_kv: tl.constexpr, # int + block_table_stride: tl.int64, # int + query_stride_0: tl.int64, # int + query_stride_1: tl.int64, # int, should be equal to head_size + BLOCK_SIZE: tl.constexpr, # int + TILE_SIZE: tl.constexpr, # int must be power of 2 + HEAD_SIZE: tl.constexpr, # int + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power 
of 2 + USE_ALIBI_SLOPES: tl.constexpr, # bool + USE_SOFTCAP: tl.constexpr, # bool + SLIDING_WINDOW: tl.constexpr, # int + stride_k_cache_0: tl.int64, # int + stride_k_cache_1: tl.int64, # int + stride_k_cache_2: tl.int64, # int + stride_k_cache_3: tl.constexpr, # int + stride_v_cache_0: tl.int64, # int + stride_v_cache_1: tl.int64, # int + stride_v_cache_2: tl.int64, # int + stride_v_cache_3: tl.constexpr, # int + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + num_seqs: tl.int32, + BLOCK_M: tl.constexpr, # int + NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int +): + q_block_global_idx = tl.program_id(0) + kv_head_idx = tl.program_id(1) + segm_idx = tl.program_id(2) + + seq_idx = find_seq_idx(query_start_len_ptr, q_block_global_idx, num_seqs, + BLOCK_Q, True) + + q_block_start_idx = tl.load(query_start_len_ptr + + seq_idx) // BLOCK_Q + seq_idx + + q_block_local_idx = q_block_global_idx - q_block_start_idx + + cur_batch_in_all_start_index = tl.load(query_start_len_ptr + seq_idx) + cur_batch_in_all_stop_index = tl.load(query_start_len_ptr + seq_idx + 1) + + cur_batch_query_len = cur_batch_in_all_stop_index \ + - cur_batch_in_all_start_index + + if q_block_local_idx * BLOCK_Q >= cur_batch_query_len: + return + + # sequence len for this particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # number of segments for this particular sequence + num_segments = NUM_SEGMENTS_PER_SEQ + tiles_per_segment = cdiv_fn(seq_len, num_segments * TILE_SIZE) + + if segm_idx * tiles_per_segment * TILE_SIZE >= seq_len: + return + + offs_m = tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_SIZE_PADDED) + offs_t = tl.arange(0, TILE_SIZE) + query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv + + query_offset_0 = cur_batch_in_all_start_index + query_pos + query_offset_1 = kv_head_idx * num_queries_per_kv + \ + offs_m % num_queries_per_kv + query_offset = (query_offset_0[:, None] * query_stride_0 + + query_offset_1[:, None] * query_stride_1 + offs_d[None, :]) + + dim_mask = tl.where(offs_d < HEAD_SIZE, 1, 0).to(tl.int1) + query_mask_0 = tl.where(query_pos < cur_batch_query_len, 1, 0).to(tl.int1) + query_mask_1 = tl.where(query_offset_1 < num_query_heads, 1, 0).to(tl.int1) + + # Q : (BLOCK_M, HEAD_SIZE_PADDED) + Q = tl.load( + query_ptr + query_offset, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + other=0.0, + ) + + block_table_offset = seq_idx * block_table_stride + + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + L = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) + + # context length for this particular sequences + context_len = seq_len - cur_batch_query_len + + # alibi slope for this head + if USE_ALIBI_SLOPES: + alibi_slope = tl.load(alibi_slopes_ptr + query_offset_1, + mask=query_mask_1, + other=0.0) + + # compute the length of the longest sequence prefix spanned by any + # query token in the current q_block (q_block_local_idx) + max_seq_prefix_len = context_len + q_block_local_idx * BLOCK_Q + ( + BLOCK_M - 1) // num_queries_per_kv + 1 + + # adjust for potential padding in the last q_block by considering the + # actual sequence length + max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len) + + # calculate the number of tiles that need to be processed to + # cover the longest sequence prefix (due to causal masking, tiles beyond + # this prefix can be skipped) + num_tiles = cdiv_fn(max_seq_prefix_len, TILE_SIZE) + + # iterate through tiles within current segment + 
for j in range( + segm_idx * tiles_per_segment, + min((segm_idx + 1) * tiles_per_segment, num_tiles), + ): + seq_offset = j * TILE_SIZE + offs_t + tile_mask = seq_offset < max_seq_prefix_len + + physical_block_idx = tl.load(block_tables_ptr + block_table_offset + + seq_offset // BLOCK_SIZE).to(tl.int64) + + v_offset = (physical_block_idx[:, None] * stride_v_cache_0 + + kv_head_idx * stride_v_cache_2 + + offs_d[None, :] * stride_v_cache_3 + + (seq_offset % BLOCK_SIZE)[:, None] * stride_v_cache_1) + + k_offset = (physical_block_idx[None, :] * stride_k_cache_0 + + kv_head_idx * stride_k_cache_2 + + offs_d[:, None] * stride_k_cache_3 + + (seq_offset % BLOCK_SIZE)[None, :] * stride_k_cache_1) + + # K : (HEAD_SIZE, TILE_SIZE) + K_load = tl.load(key_cache_ptr + k_offset, + mask=dim_mask[:, None] & tile_mask[None, :], + other=0.0) + + if K_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + K = K_load + else: + K = (K_load.to(tl.float32) * tl.load(k_scale)).to(Q.dtype) + else: + K = K_load + + # V : (TILE_SIZE, HEAD_SIZE) + V_load = tl.load(value_cache_ptr + v_offset, + mask=dim_mask[None, :] & tile_mask[:, None], + other=0.0) + + if V_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + V = V_load + else: + V = (V_load.to(tl.float32) * tl.load(v_scale)).to(Q.dtype) + else: + V = V_load + + seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 + + # S : (BLOCK_M, TILE_SIZE) + S = tl.zeros(shape=(BLOCK_M, TILE_SIZE), dtype=tl.float32) + S += scale * tl.dot(Q, K) + + if USE_SOFTCAP: + S = apply_softcap(S, softcap) + + S = tl.where(query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, + S, float("-inf")) + + if SLIDING_WINDOW > 0: + S = tl.where((context_len + query_pos[:, None] - seq_offset) + < SLIDING_WINDOW, S, float("-inf")) + + if USE_ALIBI_SLOPES: + S += alibi_slope[:, None] * (seq_offset - context_len) + + # compute running maximum + # m_j : (BLOCK_M,) + m_j = tl.maximum(M, tl.max(S, axis=1)) + + # For sliding window there's a chance the max is -inf due to masking of + # the entire row. 
In this case we need to set m_j 0 to avoid NaN + m_j = tl.where(m_j > float("-inf"), m_j, 0.0) + + # P : (BLOCK_M, TILE_SIZE,) + P = tl.exp(S - m_j[:, None]) + + # l_j : (BLOCK_M,) + l_j = tl.sum(P, axis=1) + + # alpha : (BLOCK_M, ) + alpha = tl.exp(M - m_j) + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc = acc * alpha[:, None] + + # update constants + L = L * alpha + l_j + M = m_j + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc += tl.dot(P.to(V.dtype), V) + + #if kv_head_idx == 0: + # print(f"\nq_block_global_idx={q_block_global_idx} segm_idx={segm_idx} j={j} : L={L} M={M}\n") # acc={acc}\n") + + segm_output_offset = ( + query_offset_0[:, None].to(tl.int64) * + (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + query_offset_1[:, None] * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + segm_idx * HEAD_SIZE_PADDED + tl.arange(0, HEAD_SIZE_PADDED)[None, :]) + tl.store( + segm_output_ptr + segm_output_offset, + acc, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + ) + segm_offset = (query_offset_0.to(tl.int64) * + (num_query_heads * NUM_SEGMENTS_PER_SEQ) + + query_offset_1 * NUM_SEGMENTS_PER_SEQ + segm_idx) + tl.store(segm_max_ptr + segm_offset, M, mask=query_mask_0 & query_mask_1) + tl.store(segm_expsum_ptr + segm_offset, + L, + mask=query_mask_0 & query_mask_1) + + +@triton.jit +def reduce_segments( + output_ptr, # [num_tokens, num_query_heads, head_size] + segm_output_ptr, + #[num_tokens, num_query_heads, max_num_segments, head_size] + segm_max_ptr, # [num_tokens, num_query_heads, max_num_segments] + segm_expsum_ptr, # [num_tokens, num_query_heads, max_num_segments] + seq_lens_ptr, # [num_seqs] + num_seqs, # int + num_query_heads: tl.constexpr, # int + output_stride_0: tl.int64, # int + output_stride_1: tl.int64, # int, should be equal to head_size + block_table_stride: tl.int64, # int + TILE_SIZE: tl.constexpr, # int + HEAD_SIZE: tl.constexpr, # int, must be power of 2 + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int +): + query_token_idx = tl.program_id(0) + query_head_idx = tl.program_id(1) + + seq_idx = find_seq_idx(query_start_len_ptr, query_token_idx, num_seqs, + BLOCK_Q, False) + + # sequence len for this particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # number of segments for this particular sequence + num_segments = NUM_SEGMENTS_PER_SEQ + tiles_per_segment = cdiv_fn(seq_len, num_segments * TILE_SIZE) + + # create masks for subsequent loads + act_num_segments = cdiv_fn(seq_len, tiles_per_segment * TILE_SIZE) + segm_mask = tl.arange(0, NUM_SEGMENTS_PER_SEQ) < tl.full( + [NUM_SEGMENTS_PER_SEQ], act_num_segments, dtype=tl.int32) + dim_mask = tl.where(tl.arange(0, HEAD_SIZE_PADDED) < HEAD_SIZE, 1, + 0).to(tl.int1) + + # load segment maxima + segm_offset = (query_token_idx.to(tl.int64) * + (num_query_heads * NUM_SEGMENTS_PER_SEQ) + + query_head_idx * NUM_SEGMENTS_PER_SEQ + + tl.arange(0, NUM_SEGMENTS_PER_SEQ)) + segm_max = tl.load(segm_max_ptr + segm_offset, + mask=segm_mask, + other=float("-inf")) + overall_max = tl.max(segm_max) + + # load and rescale segment exp sums + segm_expsum = tl.load(segm_expsum_ptr + segm_offset, + mask=segm_mask, + other=0.0) + segm_expsum = segm_expsum * tl.exp(segm_max - overall_max) + overall_expsum = tl.sum(segm_expsum) + + # load, rescale, and add segment attention outputs + segm_output_offset = ( + query_token_idx.to(tl.int64) * + (num_query_heads * NUM_SEGMENTS_PER_SEQ * 
HEAD_SIZE_PADDED) +
+        query_head_idx * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) +
+        tl.arange(0, NUM_SEGMENTS_PER_SEQ)[:, None] * HEAD_SIZE_PADDED +
+        tl.arange(0, HEAD_SIZE_PADDED)[None, :])
+    segm_output = tl.load(
+        segm_output_ptr + segm_output_offset,
+        mask=segm_mask[:, None] & dim_mask[None, :],
+        other=0.0,
+    )
+    segm_output *= tl.exp(segm_max - overall_max)[:, None]
+    acc_sum = tl.sum(segm_output, axis=0)
+    # safely divide by overall_expsum, returning 0.0 if overall_expsum is 0
+    acc = tl.where(overall_expsum == 0.0, 0.0, acc_sum / overall_expsum)
+
+    # write result
+    output_offset = (query_token_idx * output_stride_0 +
+                     query_head_idx * output_stride_1 +
+                     tl.arange(0, HEAD_SIZE_PADDED))
+    tl.store(output_ptr + output_offset, acc, mask=dim_mask)
+
+
+def unified_attention(
+    q,
+    k,
+    v,
+    out,
+    cu_seqlens_q,
+    max_seqlen_q,
+    seqused_k,
+    max_seqlen_k,
+    softmax_scale,
+    causal,
+    window_size,
+    block_table,
+    softcap,
+    q_descale,
+    k_descale,
+    v_descale,
+    alibi_slopes=None,
+    force_selection=None,  # None, 2, 3 to select kernel
+):
+
+    assert causal, "Only causal attention is supported"
+    assert q_descale is None, "Q scales not supported"
+
+    block_size = v.shape[1]
+    assert q.element_size() >= 2 or block_size >= 32, \
+        "Block size must be at least 32 for fp8"
+
+    use_alibi_slopes = alibi_slopes is not None
+
+    block_size = v.shape[1]
+    num_seqs = len(seqused_k)
+    num_query_heads = q.shape[1]
+    num_kv_heads = k.shape[2]
+    num_queries_per_kv = num_query_heads // num_kv_heads
+    head_size = q.shape[2]
+
+    BLOCK_M = 16
+    BLOCK_Q = BLOCK_M // num_queries_per_kv
+
+    # Ideally we would launch the kernel with:
+    # \sum_i[ceil(query_len[i] / BLOCK_Q)] blocks.
+    # However, it is slow to realize the query_lens on cpu.
+    # Instead we use upper-bound:
+    # \sum_i[ceil(query_len[i] / BLOCK_Q)]
+    # <= \sum_i[floor(query_len[i] / BLOCK_Q) + 1]
+    # = \sum_i[floor(query_len[i] / BLOCK_Q)] + num_seqs
+    # <= floor(\sum_i(query_len[i]) / BLOCK_Q) + num_seqs
+    # = floor(q.shape[0] / BLOCK_Q) + num_seqs
+    total_num_q_blocks = q.shape[0] // BLOCK_Q + num_seqs
+
+    TILE_SIZE_PREFILL = 32
+    TILE_SIZE_DECODE = 32
+
+    # if batch contains a prefill (unless the 3d kernel is forced)
+    if (max_seqlen_q > 1 or total_num_q_blocks * num_kv_heads > 128
+            or force_selection == 2) and force_selection != 3:
+        kernel_unified_attention_2d[(
+            total_num_q_blocks,
+            num_kv_heads,
+        )](
+            output_ptr=out,
+            query_ptr=q,
+            key_cache_ptr=k,
+            value_cache_ptr=v,
+            block_tables_ptr=block_table,
+            seq_lens_ptr=seqused_k,
+            alibi_slopes_ptr=alibi_slopes,
+            scale=softmax_scale,
+            k_scale=k_descale,
+            v_scale=v_descale,
+            softcap=softcap,
+            num_query_heads=num_query_heads,
+            num_queries_per_kv=num_queries_per_kv,
+            block_table_stride=block_table.stride(0),
+            query_stride_0=q.stride(0),
+            query_stride_1=q.stride(1),
+            output_stride_0=out.stride(0),
+            output_stride_1=out.stride(1),
+            BLOCK_SIZE=block_size,
+            TILE_SIZE=TILE_SIZE_PREFILL,
+            HEAD_SIZE=head_size,
+            HEAD_SIZE_PADDED=triton.next_power_of_2(head_size),
+            USE_ALIBI_SLOPES=use_alibi_slopes,
+            USE_SOFTCAP=(softcap > 0),
+            SLIDING_WINDOW=(1 + window_size[0]),
+            stride_k_cache_0=k.stride(0),
+            stride_k_cache_1=k.stride(1),
+            stride_k_cache_2=k.stride(2),
+            stride_k_cache_3=k.stride(3),
+            stride_v_cache_0=v.stride(0),
+            stride_v_cache_1=v.stride(1),
+            stride_v_cache_2=v.stride(2),
+            stride_v_cache_3=v.stride(3),
+            query_start_len_ptr=cu_seqlens_q,
+            BLOCK_Q=BLOCK_Q,
+            num_seqs=num_seqs,
+            BLOCK_M=BLOCK_M,
+        )
+    else:
+        # for initial version, NUM_SEGMENTS = 16 is chosen as a default
+        # value that showed
good performance in tests + NUM_SEGMENTS = 16 + + segm_output = torch.empty( + q.shape[0], + num_query_heads, + NUM_SEGMENTS, + triton.next_power_of_2(head_size), + dtype=torch.float32, + device=q.device, + ) + segm_max = torch.empty( + q.shape[0], + num_query_heads, + NUM_SEGMENTS, + dtype=torch.float32, + device=q.device, + ) + segm_expsum = torch.empty( + q.shape[0], + num_query_heads, + NUM_SEGMENTS, + dtype=torch.float32, + device=q.device, + ) + + kernel_unified_attention_3d[( + total_num_q_blocks, num_kv_heads, NUM_SEGMENTS)]( + segm_output_ptr=segm_output, + segm_max_ptr=segm_max, + segm_expsum_ptr=segm_expsum, + query_ptr=q, + key_cache_ptr=k, + value_cache_ptr=v, + block_tables_ptr=block_table, + seq_lens_ptr=seqused_k, + alibi_slopes_ptr=alibi_slopes, + scale=softmax_scale, + k_scale=k_descale, + v_scale=v_descale, + softcap=softcap, + num_query_heads=num_query_heads, + num_queries_per_kv=num_queries_per_kv, + block_table_stride=block_table.stride(0), + query_stride_0=q.stride(0), + query_stride_1=q.stride(1), + BLOCK_SIZE=block_size, + TILE_SIZE=TILE_SIZE_DECODE, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + USE_ALIBI_SLOPES=use_alibi_slopes, + USE_SOFTCAP=(softcap > 0), + SLIDING_WINDOW=(1 + window_size[0]), + stride_k_cache_0=k.stride(0), + stride_k_cache_1=k.stride(1), + stride_k_cache_2=k.stride(2), + stride_k_cache_3=k.stride(3), + stride_v_cache_0=v.stride(0), + stride_v_cache_1=v.stride(1), + stride_v_cache_2=v.stride(2), + stride_v_cache_3=v.stride(3), + query_start_len_ptr=cu_seqlens_q, + BLOCK_Q=BLOCK_Q, + num_seqs=num_seqs, + BLOCK_M=BLOCK_M, + NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, + ) + reduce_segments[(q.shape[0], num_query_heads)]( + output_ptr=out, + segm_output_ptr=segm_output, + segm_max_ptr=segm_max, + segm_expsum_ptr=segm_expsum, + seq_lens_ptr=seqused_k, + num_seqs=num_seqs, + num_query_heads=num_query_heads, + output_stride_0=out.stride(0), + output_stride_1=out.stride(1), + block_table_stride=block_table.stride(0), + TILE_SIZE=TILE_SIZE_DECODE, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + query_start_len_ptr=cu_seqlens_q, + BLOCK_Q=BLOCK_Q, + NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, + ) \ No newline at end of file diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 8faf2fd91..d34cc0039 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -71,6 +71,9 @@ class Implementation(Enum): TRITON_TUNED = 14 TRITON_FALLBACK = 15 UNF_TRITON_2D_SIMPLE = 16 + NT_UNF_TRITON_3D = 17 + NT_UNF_TRITON_2D = 18 + NT_UNF_TRITON_AUTO = 19 class BenchmarkMode(Enum): @@ -1044,6 +1047,9 @@ def test_prefix_vllm_v1_attention( Implementation.UNF_TRITON_2D, Implementation.UNF_TRITON_2D_SIMPLE, Implementation.UNF_TRITON_AUTO, + Implementation.NT_UNF_TRITON_3D, + Implementation.NT_UNF_TRITON_2D, + Implementation.NT_UNF_TRITON_AUTO, ]: pytest.skip() @@ -1299,6 +1305,12 @@ def test_prefix_vllm_v1_attention( from callers import SimpleUnifiedTriton2dAttentionCaller as Caller elif implementation == Implementation.UNF_TRITON_AUTO: from callers import UnifiedTritonAutoAttentionCaller as Caller + elif implementation == Implementation.NT_UNF_TRITON_3D: + from callers import NewTilesUnifiedTriton3dAttentionCaller as Caller + elif implementation == Implementation.NT_UNF_TRITON_2D: + from callers import NewTilesUnifiedTriton2dAttentionCaller as Caller + elif implementation == Implementation.NT_UNF_TRITON_AUTO: + from callers import NewTilesUnifiedTritonAutoAttentionCaller as Caller if Caller.requires_allocated_output: 
output = torch.empty_like(query) diff --git a/scripts/callers/__init__.py b/scripts/callers/__init__.py index 212944442..ad51a88e2 100644 --- a/scripts/callers/__init__.py +++ b/scripts/callers/__init__.py @@ -58,3 +58,8 @@ UnifiedTritonAutoAttentionCaller, SimpleUnifiedTriton2dAttentionCaller, ) +from .unified_triton_newtiles import ( + NewTilesUnifiedTriton2dAttentionCaller, + NewTilesUnifiedTriton3dAttentionCaller, + NewTilesUnifiedTritonAutoAttentionCaller, +) diff --git a/scripts/callers/unified_triton_newtiles.py b/scripts/callers/unified_triton_newtiles.py new file mode 100644 index 000000000..d3d751a63 --- /dev/null +++ b/scripts/callers/unified_triton_newtiles.py @@ -0,0 +1,164 @@ +# /******************************************************************************* +# * Copyright 2025 IBM Corporation +# * +# * Licensed under the Apache License, Version 2.0 (the "License"); +# * you may not use this file except in compliance with the License. +# * You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. +# *******************************************************************************/ +# + +import torch + +from ibm_triton_lib.kernels import unified_attention_newtiles +from .base import PrefixPrefillCaller + + +class NewTilesUnifiedTriton3dAttentionCaller(PrefixPrefillCaller): + @staticmethod + def make_call_func( + output, + query, + key_cache, + value_cache, + key, + value, + block_tables, + seq_lens, + ctx_lens, + query_lens, + start_loc, + seq_start_loc, + softmax_scale, + # kv_cache_dtype, # unused + force_selection=3, + ): + """ + query: shape = [num_tokens, num_heads, head_size] + key: shape = [num_tokens, num_kv_heads, head_size] + value: shape = [num_tokens, num_kv_heads, head_size] + k_cache = [num_blocks, block_size, num_kv_heads, head_size] + v_cache = [num_blocks, block_size, num_kv_heads, head_size] + Returns: + shape = [num_tokens, num_heads, head_size] + """ + + max_query_len = query_lens.max() + max_seqlen = seq_lens.max() + + avg_seqlen_q = query_lens.to(torch.float).mean() + avg_seqlen_k = seq_lens.to(torch.float).mean() + + def call_and_process_output(): + # k must have shape (num_blocks, page_block_size, num_heads_k, head_size) + return unified_attention_newtiles( + q=query, + k=key_cache, + v=value_cache, + out=output, + cu_seqlens_q=start_loc, + max_seqlen_q=max_query_len, + seqused_k=seq_lens, + max_seqlen_k=max_seqlen, + softmax_scale=softmax_scale, + causal=True, + window_size=(-1, -1), + block_table=block_tables, + softcap=0, + q_descale=None, + k_descale=None, # TODO? + v_descale=None, # TODO? 
+ alibi_slopes=None, + # avg_seqlen_q=avg_seqlen_q, + # avg_seqlen_k=avg_seqlen_k, + force_selection=force_selection, + ) + + return call_and_process_output + + @staticmethod + def requires_allocated_output() -> bool: + return True + + +class NewTilesUnifiedTriton2dAttentionCaller(NewTilesUnifiedTriton3dAttentionCaller): + @staticmethod + def make_call_func( + output, + query, + key_cache, + value_cache, + key, + value, + block_tables, + seq_lens, + ctx_lens, + query_lens, + start_loc, + seq_start_loc, + softmax_scale, + # kv_cache_dtype, # unused + force_selection=2, + ): + + return NewTilesUnifiedTriton3dAttentionCaller.make_call_func( + output, + query, + key_cache, + value_cache, + key, + value, + block_tables, + seq_lens, + ctx_lens, + query_lens, + start_loc, + seq_start_loc, + softmax_scale, + force_selection=2, + ) + + +class NewTilesUnifiedTritonAutoAttentionCaller(NewTilesUnifiedTriton3dAttentionCaller): + @staticmethod + def make_call_func( + output, + query, + key_cache, + value_cache, + key, + value, + block_tables, + seq_lens, + ctx_lens, + query_lens, + start_loc, + seq_start_loc, + softmax_scale, + # kv_cache_dtype, # unused + force_selection=None, + ): + + return NewTilesUnifiedTriton3dAttentionCaller.make_call_func( + output, + query, + key_cache, + value_cache, + key, + value, + block_tables, + seq_lens, + ctx_lens, + query_lens, + start_loc, + seq_start_loc, + softmax_scale, + force_selection=None, + ) # none triggers vllm default behaviour diff --git a/scripts/setups/prefix_tune_2d.conf b/scripts/setups/prefix_tune_2d.conf index 1e89c3397..2dd6b8a51 100644 --- a/scripts/setups/prefix_tune_2d.conf +++ b/scripts/setups/prefix_tune_2d.conf @@ -12,6 +12,7 @@ PREFIX_PREFILL_SHARE_OF_PARTIAL_PREFILL = [0.0, 0.5] # PREFIX_PREFILL_SHARE_OF_PARTIAL_PREFILL = [0.5] # PREFIX_PREFILL_BATCH_COMPOSITION = ["ALTERNATING"] PREFIX_PREFILL_BATCH_COMPOSITION = ["DEC_PRE"] +# PREFIX_PREFILL_BATCH_COMPOSITION = ["DEC_PRE", "ALTERNATING"] HEAD_SIZES = [128] # only powers of 2! 
for llama2 & 3 # head_size * head_numbers = hidden_size @@ -26,8 +27,9 @@ MAX_VALUES = [1.0] BENCHMARK_MODES = ["CUDA_EVENTS"] # IMPLEMENTATION_UT = ["UNF_TRITON_2D"] -IMPLEMENTATION_UT = ["UNF_TRITON_2D_SIMPLE"] +# IMPLEMENTATION_UT = ["UNF_TRITON_2D_SIMPLE"] # IMPLEMENTATION_UT = ["FLASH_ATTN", "UNF_TRITON_2D"] +IMPLEMENTATION_UT = ["NT_UNF_TRITON_2D", "NT_UNF_TRITON_3D"] # TRITON_BACKEND_DEBUG = 1 # STORE_TEST_RESULT_PATH=/results From ad50b39915b6dc86bf42d667ea4dd7ca2ed4e89c Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Tue, 29 Jul 2025 11:22:30 -0400 Subject: [PATCH 25/61] updating dejavu version Signed-off-by: Burkhard Ringlein --- triton-dejavu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/triton-dejavu b/triton-dejavu index 2c5616e18..9de1daa0e 160000 --- a/triton-dejavu +++ b/triton-dejavu @@ -1 +1 @@ -Subproject commit 2c5616e1850ed54b24be2f31017f9f4d0fb74727 +Subproject commit 9de1daa0e61b056a23cf4796239629a8f6330995 From 80e9ea09524cc2b4c31ae6432a1bc245a640ada4 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Tue, 29 Jul 2025 12:08:15 -0400 Subject: [PATCH 26/61] updating serving range script Signed-off-by: Burkhard Ringlein --- scripts/bench_vllm_user_range.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/scripts/bench_vllm_user_range.py b/scripts/bench_vllm_user_range.py index c93c88842..7674450f9 100644 --- a/scripts/bench_vllm_user_range.py +++ b/scripts/bench_vllm_user_range.py @@ -40,14 +40,17 @@ def create_dir_if_not_exist(path, mode=0o777): except PermissionError as e: print(f"can't set permission of directory {path}: {e}") +if len(sys.argv) < 4: + print(f"Usage: {sys.argv[0]} ") + exit(-1) num_users_to_test = [1, 2, 4, 8, 16, 32, 64, 128] gpu_name = torch.cuda.get_device_name().replace(" ", "_").replace("/", "_") # model = "/model/llama3.1-8b/instruct/" model = sys.argv[1] -model_path = f"/models/{model}/" testcase_name = sys.argv[2] +result_path = os.path.abspath(sys.argv[3]) # max_rounds = 128 max_rounds = 64 @@ -55,14 +58,22 @@ def create_dir_if_not_exist(path, mode=0o777): timestamp_f = datetime.now().strftime("%Y-%m-%d_%H%M") -# result_dir = f"/results/{model.replace('/','-')}/{gpu_name}/{testcase_name}" -result_dir = ( - f"/results/{model.replace('/','-')}/{gpu_name}/{testcase_name}/exp_{timestamp_f}/" -) +# result_dir = ( +# f"/results/{model.replace('/','-')}/{gpu_name}/{testcase_name}/exp_{timestamp_f}/" +# ) +result_dir = f"{result_path}/{model.replace('/','-')}/{gpu_name}/{testcase_name}/exp_{timestamp_f}/" + +bench_script = "/workspace/benchmarks/benchmark_serving.py" +if not os.path.isfile(bench_script): + bench_script = "./vllm-triton-backend/vllm/benchmarks/benchmark_serving.py" + if not os.path.isfile(bench_script): + print(f"can't find benchmark script benchmark_serving.py") + exit(-1) # os.system(f"mkdir -p {result_dir}") create_dir_if_not_exist_recursive(result_dir) +start_time = datetime.now() for max_concurrency in num_users_to_test: num_prompts = ( max_num_prompts @@ -70,12 +81,13 @@ def create_dir_if_not_exist(path, mode=0o777): else int(max_rounds * max_concurrency) ) cmd = ( - f"VLLM_USE_V1=1 python /workspace/benchmarks/benchmark_serving.py " - f"--model {model_path} " + f"VLLM_USE_V1=1 python {bench_script} " + f"--model {model} " f"--dataset-name sharegpt --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json " f"--save-result --result-dir {result_dir} --max-concurrency {max_concurrency} " f"--percentile-metrics ttft,tpot,itl,e2el --metric-percentiles 
20,50,80,99 " f"--num-prompts {num_prompts} " + f"--port 8803" ) print(cmd) rv = os.system(cmd) @@ -83,5 +95,7 @@ def create_dir_if_not_exist(path, mode=0o777): print(f"benchmark command returned {rv}, stopping...") break +end_time = datetime.now() print(f"results stored in: {result_dir}") os.system(f"ls -alh {result_dir}") +print(f"Benchmark time: {end_time-start_time}") From 0b6469775a313be403e5ae66d605907741b5d9dd Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Wed, 30 Jul 2025 05:16:37 -0400 Subject: [PATCH 27/61] preparing cuda graph catpure Signed-off-by: Burkhard Ringlein --- .../kernels/triton_unified_attention.py | 834 +++++-------- .../kernels/triton_unified_attention_tuned.py | 1030 +++++++++++++++++ .../kernels/triton_unified_newtiles.py | 9 +- scripts/benchmark.py | 4 +- scripts/callers/unified_triton.py | 4 +- scripts/setups/prefix_tune_2d.conf | 10 +- 6 files changed, 1325 insertions(+), 566 deletions(-) create mode 100644 ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_tuned.py diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention.py b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention.py index 2f6911317..713db70f6 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention.py @@ -8,12 +8,11 @@ # - Thomas Parnell import torch -import triton -import triton.language as tl -import os -import triton_dejavu -import functools +from vllm.logger import init_logger +from vllm.triton_utils import tl, triton + +logger = init_logger(__name__) @triton.jit @@ -30,13 +29,8 @@ def apply_softcap(S, x): @triton.jit -def find_seq_idx( - query_start_len_ptr, - target_idx, - num_seqs, - BLOCK_Q: tl.constexpr, - use_q_block_mode: tl.constexpr, -): +def find_seq_idx(query_start_len_ptr, target_idx, num_seqs, + BLOCK_Q: tl.constexpr, use_q_block_mode: tl.constexpr): left: tl.int32 = 0 right = num_seqs while left < right: @@ -52,288 +46,61 @@ def find_seq_idx( return left - 1 -# not as lambda, for python3.9 -def fallback_heuristic_dt2(key): - tpa_test_q = key[1] - tpa_test_k = key[2] - # Model trained on max - if tpa_test_q < 1024: - BLOCK_M = 16 - else: - BLOCK_M = 64 - - if tpa_test_k < 64: - if tpa_test_k < 32: - BLOCK_N = 16 - else: - BLOCK_N = 32 - else: - if tpa_test_q < 256: - BLOCK_N = 128 - else: - BLOCK_N = 64 - ret = triton.Config( - {"BLOCK_M": BLOCK_M, "BLOCK_N": BLOCK_N}, num_stages=2, num_warps=8 - ) - # num stages = 2, to be on the safe side for MI300 - return ret - - -def informed_fallback_next(key, cache): - # key[0] = max q - # key[2] = avg q - ret = cache[min(cache.keys(), key=lambda x: abs(x - key[0]))] - return ret - - -def prepare_informed_fallback(cache): - ret = {int(k[0]): c for k, c in cache.items()} - return ret - - -@functools.lru_cache -def prefill_heuristics_2d(MAX_SEQ_Q, MAX_SEQ_K, AVG_SEQ_Q, AVG_SEQ_K): - gpu_name = torch.cuda.get_device_name() - # print(f"MAX_SEQ_Q {MAX_SEQ_Q}, MAX_SEQ_K {MAX_SEQ_K}, AVG_SEQ_Q {AVG_SEQ_Q}, AVG_SEQ_K {AVG_SEQ_K}") - if "NVIDIA H100" in gpu_name: - # # TPA original heuristic - # if MAX_SEQ_Q < 1024: - # BLOCK_M = 16 - # else: - # BLOCK_M = 64 - # if MAX_SEQ_K < 64: - # if MAX_SEQ_K < 32: - # BLOCK_N = 16 - # else: - # BLOCK_N = 32 - # else: - # if MAX_SEQ_Q < 256: - # BLOCK_N = 128 - # else: - # BLOCK_N = 64 - # config = {'num_stages': 3, 'num_warps': 4, - # 'BLOCK_N': BLOCK_N, 'BLOCK_M': BLOCK_M} - # dejavu with microbenchmarks - # TODO: update to latest tuning with AVG - if MAX_SEQ_K <= 96: 
- config = {"num_stages": 4, "num_warps": 4, "BLOCK_N": 32, "BLOCK_M": 16} - else: - if MAX_SEQ_Q <= 192: - if MAX_SEQ_K <= 1536: - config = { - "num_stages": 2, - "num_warps": 8, - "BLOCK_N": 128, - "BLOCK_M": 16, - } - else: - config = { - "num_stages": 8, - "num_warps": 8, - "BLOCK_N": 128, - "BLOCK_M": 16, - } - else: - config = { - "num_stages": 1, - "num_warps": 8, - "BLOCK_N": 128, - "BLOCK_M": 128, - } - elif "AMD Instinct MI300" in gpu_name: - # dejavu with microbenchmarks - # TODO: update to latest tuning with AVG - if MAX_SEQ_Q <= 384: - if MAX_SEQ_K <= 96: - config = {"num_stages": 4, "num_warps": 4, "BLOCK_N": 32, "BLOCK_M": 16} - else: - if MAX_SEQ_K <= 192: - if MAX_SEQ_Q <= 96: - config = { - "num_stages": 2, - "num_warps": 8, - "BLOCK_N": 128, - "BLOCK_M": 16, - } - else: - config = { - "num_stages": 4, - "num_warps": 4, - "BLOCK_N": 32, - "BLOCK_M": 16, - } - else: - if MAX_SEQ_Q <= 128: - config = { - "num_stages": 4, - "num_warps": 4, - "BLOCK_N": 32, - "BLOCK_M": 16, - } - else: - if MAX_SEQ_K <= 384: - config = { - "num_stages": 4, - "num_warps": 4, - "BLOCK_N": 32, - "BLOCK_M": 16, - } - else: - config = { - "num_stages": 1, - "num_warps": 4, - "BLOCK_N": 256, - "BLOCK_M": 32, - } - else: - if MAX_SEQ_K <= 768: - config = {"num_stages": 4, "num_warps": 4, "BLOCK_N": 16, "BLOCK_M": 64} - else: - config = {"num_stages": 1, "num_warps": 2, "BLOCK_N": 64, "BLOCK_M": 64} - else: - # default - config = { - "BLOCK_M": 64 if MAX_SEQ_Q > 1 and AVG_SEQ_Q >= 4096 else 16, - "BLOCK_N": 16 if MAX_SEQ_K < 128 and AVG_SEQ_Q <= 4096 else 64, - "num_warps": 4, - "num_stages": 3, - } - # print(config) - return config - - -@triton_dejavu.jitcache( - # this list is shorter, since it will be called only within one model - check_keys=[ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "stride_k_cache_3", - "stride_v_cache_3", - ], - check_specialization=["num_seqs"], - assume_const=[ - "scale", - "k_scale", - "v_scale", - "query_stride_1", - "output_stride_1", - "stride_k_cache_0", - "stride_k_cache_1", - "stride_k_cache_2", - "stride_k_cache_4", - "stride_v_cache_0", - "stride_v_cache_1", - "stride_v_cache_2", - ], - autotuner_args=["BLOCK_N", "BLOCK_M"], -) -@triton_dejavu.autotune( - config_space=triton_dejavu.ConfigSpace( - { - "BLOCK_N": [16, 32, 64, 128, 256, 512], - "BLOCK_M": [16, 32, 64, 128, 256, 512], - }, - num_warps=[2, 4, 8], - num_stages=[1, 2, 4, 6, 8], - ), - # this list is longer, since it would be used for multiple models - key=[ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - ], - custom_data_storage=os.path.abspath( - os.path.join(os.path.dirname(__file__), "dejavu_data") - ), - use_cuda_graph=True, - use_bo=True, - search_max_search_t=360, - informed_fallback=informed_fallback_next, - prepare_informed_fallback=prepare_informed_fallback, - fallback_heuristic=fallback_heuristic_dt2, - ignore_dtypes=True, -) -# @triton.heuristics( -# { -# "BLOCK_M": lambda args: prefill_heuristics_2d(args['MAX_SEQ_Q'], args['MAX_SEQ_K'], args['AVG_SEQ_Q'], args['AVG_SEQ_K'])['BLOCK_M'], -# "BLOCK_N": lambda args: prefill_heuristics_2d(args['MAX_SEQ_Q'], args['MAX_SEQ_K'], args['AVG_SEQ_Q'], args['AVG_SEQ_K'])['BLOCK_N'], -# "num_warps": lambda args: prefill_heuristics_2d(args['MAX_SEQ_Q'], args['MAX_SEQ_K'], args['AVG_SEQ_Q'], args['AVG_SEQ_K'])['num_warps'], -# "num_stages": lambda args: 
prefill_heuristics_2d(args['MAX_SEQ_Q'], args['MAX_SEQ_K'], args['AVG_SEQ_Q'], args['AVG_SEQ_K'])['num_stages'], -# } -# ) @triton.jit def kernel_unified_attention_2d( - output_ptr, # [num_tokens, num_query_heads, head_size] - query_ptr, # [num_tokens, num_query_heads, head_size] - key_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] - value_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] - block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] - seq_lens_ptr, # [num_seqs] - alibi_slopes_ptr, # [num_query_heads] - scale, # float32 - k_scale, # float32 - v_scale, # float32 - softcap, # float32 - num_query_heads: tl.constexpr, # int - num_queries_per_kv: tl.constexpr, # int - block_table_stride: tl.int64, # int - query_stride_0: tl.int64, # int - query_stride_1: tl.int64, # int, should be equal to head_size - output_stride_0: tl.int64, # int - output_stride_1: tl.int64, # int, should be equal to head_size - BLOCK_SIZE: tl.constexpr, # int - HEAD_SIZE: tl.constexpr, # int - HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 - USE_ALIBI_SLOPES: tl.constexpr, # bool - USE_SOFTCAP: tl.constexpr, # bool - SLIDING_WINDOW: tl.constexpr, # int - stride_k_cache_0: tl.int64, # int - stride_k_cache_1: tl.int64, # int - stride_k_cache_2: tl.int64, # int - stride_k_cache_3: tl.constexpr, # int - stride_v_cache_0: tl.int64, # int - stride_v_cache_1: tl.int64, # int - stride_v_cache_2: tl.int64, # int - stride_v_cache_3: tl.constexpr, # int - query_start_len_ptr, # [num_seqs+1] - num_seqs: tl.int32, - # used as input to the autotuner/heuristics - MAX_SEQ_Q: tl.constexpr, - MAX_SEQ_K: tl.constexpr, - AVG_SEQ_Q: tl.constexpr, - AVG_SEQ_K: tl.constexpr, - # autotuner args - BLOCK_M: tl.constexpr, # int - BLOCK_N: tl.constexpr, # int + output_ptr, # [num_tokens, num_query_heads, head_size] + query_ptr, # [num_tokens, num_query_heads, head_size] + key_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + value_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_query_heads] + scale, # float32 + k_scale, # float32 + v_scale, # float32 + softcap, # float32 + num_query_heads: tl.constexpr, # int + num_queries_per_kv: tl.constexpr, # int + block_table_stride: tl.int64, # int + query_stride_0: tl.int64, # int + query_stride_1: tl.int64, # int, should be equal to head_size + output_stride_0: tl.int64, # int + output_stride_1: tl.int64, # int, should be equal to head_size + BLOCK_SIZE: tl.constexpr, # int + HEAD_SIZE: tl.constexpr, # int + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + USE_ALIBI_SLOPES: tl.constexpr, # bool + USE_SOFTCAP: tl.constexpr, # bool + SLIDING_WINDOW: tl.constexpr, # int + stride_k_cache_0: tl.int64, # int + stride_k_cache_1: tl.int64, # int + stride_k_cache_2: tl.int64, # int + stride_k_cache_3: tl.constexpr, # int + stride_v_cache_0: tl.int64, # int + stride_v_cache_1: tl.int64, # int + stride_v_cache_2: tl.int64, # int + stride_v_cache_3: tl.constexpr, # int + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + num_seqs: tl.int32, + BLOCK_M: tl.constexpr, # int ): - q_block_global_idx = tl.program_id(0) kv_head_idx = tl.program_id(1) - BLOCK_Q = BLOCK_M // num_queries_per_kv - seq_idx = find_seq_idx( - query_start_len_ptr, q_block_global_idx, num_seqs, BLOCK_Q, True - ) + seq_idx = find_seq_idx(query_start_len_ptr, q_block_global_idx, num_seqs, + BLOCK_Q, True) - q_block_start_idx = 
tl.load(query_start_len_ptr + seq_idx) // BLOCK_Q + seq_idx + q_block_start_idx = tl.load(query_start_len_ptr + + seq_idx) // BLOCK_Q + seq_idx q_block_local_idx = q_block_global_idx - q_block_start_idx cur_batch_in_all_start_index = tl.load(query_start_len_ptr + seq_idx) cur_batch_in_all_stop_index = tl.load(query_start_len_ptr + seq_idx + 1) - cur_batch_query_len = cur_batch_in_all_stop_index - cur_batch_in_all_start_index + cur_batch_query_len = cur_batch_in_all_stop_index \ + - cur_batch_in_all_start_index if q_block_local_idx * BLOCK_Q >= cur_batch_query_len: return @@ -343,12 +110,10 @@ def kernel_unified_attention_2d( query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv query_offset_0 = cur_batch_in_all_start_index + query_pos - query_offset_1 = kv_head_idx * num_queries_per_kv + offs_m % num_queries_per_kv - query_offset = ( - query_offset_0[:, None] * query_stride_0 - + query_offset_1[:, None] * query_stride_1 - + offs_d[None, :] - ) + query_offset_1 = kv_head_idx * num_queries_per_kv + \ + offs_m % num_queries_per_kv + query_offset = (query_offset_0[:, None] * query_stride_0 + + query_offset_1[:, None] * query_stride_1 + offs_d[None, :]) dim_mask = tl.where(offs_d < HEAD_SIZE, 1, 0).to(tl.int1) query_mask_0 = tl.where(query_pos < cur_batch_query_len, 1, 0).to(tl.int1) @@ -375,61 +140,45 @@ def kernel_unified_attention_2d( # alibi slope for this head if USE_ALIBI_SLOPES: - alibi_slope = tl.load( - alibi_slopes_ptr + query_offset_1, mask=query_mask_1, other=0.0 - ) + alibi_slope = tl.load(alibi_slopes_ptr + query_offset_1, + mask=query_mask_1, + other=0.0) # compute the length of the longest sequence prefix spanned by any # query token in the current q_block (q_block_local_idx) - max_seq_prefix_len = ( - context_len - + q_block_local_idx * BLOCK_Q - + (BLOCK_M - 1) // num_queries_per_kv - + 1 - ) + max_seq_prefix_len = context_len + q_block_local_idx * BLOCK_Q + ( + BLOCK_M - 1) // num_queries_per_kv + 1 # adjust for potential padding in the last q_block by considering the # actual sequence length max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len) - offs_n = tl.arange(0, BLOCK_N) - - # iterate through tiles (below the mask) - # The loop iterates only until the longest sequence. Due to causal - # masking, blocks beyond this prefix can be skipped. 
- for start_n in range(0, max_seq_prefix_len, BLOCK_N): + # calculate the number of tiles (blocks) that need to be processed to + # cover the longest sequence prefix (due to causal masking, blocks beyond + # this prefix can be skipped) + num_blocks = cdiv_fn(max_seq_prefix_len, BLOCK_SIZE) - start_n = tl.multiple_of(start_n, BLOCK_N) + # iterate through tiles + for j in range(0, num_blocks): - physical_block_idx = tl.load( - block_tables_ptr + block_table_offset + (start_n + offs_n) // BLOCK_SIZE, - mask=(start_n + offs_n) < seq_len, - other=0, - ) + physical_block_idx = tl.load(block_tables_ptr + block_table_offset + j) - v_offset = ( - physical_block_idx[:, None] * stride_v_cache_0 - + kv_head_idx * stride_v_cache_2 - + offs_d[None, :] * stride_v_cache_3 - + (offs_n[:, None] % BLOCK_SIZE) * stride_v_cache_1 - ) + offs_n = tl.arange(0, BLOCK_SIZE) - k_offset = ( - physical_block_idx[None, :] * stride_k_cache_0 - + kv_head_idx * stride_k_cache_2 - + offs_d[:, None] * stride_k_cache_3 - + (offs_n[None, :] % BLOCK_SIZE) * stride_k_cache_1 - ) + v_offset = (physical_block_idx * stride_v_cache_0 + + kv_head_idx * stride_v_cache_2 + + offs_d[None, :] * stride_v_cache_3 + + offs_n[:, None] * stride_v_cache_1) - seq_offset_load = start_n + offs_n - load_mask = seq_offset_load < max_seq_prefix_len + k_offset = (physical_block_idx * stride_k_cache_0 + + kv_head_idx * stride_k_cache_2 + + offs_d[:, None] * stride_k_cache_3 + + offs_n[None, :] * stride_k_cache_1) - # K : (HEAD_SIZE_PADDED, BLOCK_N) - K_load = tl.load( - key_cache_ptr + k_offset, - mask=dim_mask[:, None] & load_mask[None, :], - other=0.0, - ) + # K : (HEAD_SIZE, BLOCK_SIZE) + K_load = tl.load(key_cache_ptr + k_offset, + mask=dim_mask[:, None], + other=0.0) if K_load.dtype.is_fp8(): if Q.dtype.is_fp8(): @@ -439,12 +188,10 @@ def kernel_unified_attention_2d( else: K = K_load - # V : (BLOCK_N, HEAD_SIZE_PADDED) - V_load = tl.load( - value_cache_ptr + v_offset, - mask=dim_mask[None, :] & load_mask[:, None], - other=0.0, - ) + # V : (BLOCK_SIZE, HEAD_SIZE) + V_load = tl.load(value_cache_ptr + v_offset, + mask=dim_mask[None, :], + other=0.0) if V_load.dtype.is_fp8(): if Q.dtype.is_fp8(): @@ -454,29 +201,24 @@ def kernel_unified_attention_2d( else: V = V_load - seq_offset = start_n + tl.arange(0, BLOCK_N) + seq_offset = j * BLOCK_SIZE + offs_n - # seq_mask: (BLOCK_M, BLOCK_N) seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 - # S : (BLOCK_M, BLOCK_N) - S = tl.zeros(shape=(BLOCK_M, BLOCK_N), dtype=tl.float32) + # S : (BLOCK_M, BLOCK_SIZE) + S = tl.zeros(shape=(BLOCK_M, BLOCK_SIZE), dtype=tl.float32) S += scale * tl.dot(Q, K) if USE_SOFTCAP: S = apply_softcap(S, softcap) - S = tl.where( - query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, S, float("-inf") - ) + S = tl.where(query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, + S, float("-inf")) if SLIDING_WINDOW > 0: - S = tl.where( - (context_len + query_pos[:, None] - seq_offset) < SLIDING_WINDOW, - S, - float("-inf"), - ) + S = tl.where((context_len + query_pos[:, None] - seq_offset) + < SLIDING_WINDOW, S, float("-inf")) if USE_ALIBI_SLOPES: S += alibi_slope[:, None] * (seq_offset - context_len) @@ -488,7 +230,7 @@ def kernel_unified_attention_2d( # the entire row. 
In this case we need to set m_j 0 to avoid NaN m_j = tl.where(m_j > float("-inf"), m_j, 0.0) - # P : (BLOCK_M, BLOCK_N) + # P : (BLOCK_M, BLOCK_SIZE) P = tl.exp(S - m_j[:, None]) # l_j : (BLOCK_M,) @@ -510,11 +252,9 @@ def kernel_unified_attention_2d( # epilogue acc = acc / L[:, None] - output_offset = ( - query_offset_0[:, None] * output_stride_0 - + query_offset_1[:, None] * output_stride_1 - + offs_d[None, :] - ) + output_offset = (query_offset_0[:, None] * output_stride_0 + + query_offset_1[:, None] * output_stride_1 + + offs_d[None, :]) tl.store( output_ptr + output_offset, @@ -525,61 +265,62 @@ def kernel_unified_attention_2d( @triton.jit def kernel_unified_attention_3d( - segm_output_ptr, - # [num_tokens, num_query_heads, num_segments, head_size] - segm_max_ptr, # [num_tokens, num_query_heads, num_segments] - segm_expsum_ptr, # [num_tokens, num_query_heads, num_segments] - query_ptr, # [num_tokens, num_query_heads, head_size] - key_cache_ptr, # [num_blks, num_kv_heads, head_size // x, blk_size, x] - value_cache_ptr, # [num_blks, num_kv_heads, head_size, blk_size] - block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] - seq_lens_ptr, # [num_seqs] - alibi_slopes_ptr, # [num_query_heads] - scale, # float32 - k_scale, # float32 - v_scale, # float32 - softcap, # float32 - num_query_heads: tl.constexpr, # int - num_queries_per_kv: tl.constexpr, # int - block_table_stride: tl.int64, # int - query_stride_0: tl.int64, # int - query_stride_1: tl.int64, # int, should be equal to head_size - BLOCK_SIZE: tl.constexpr, # int - HEAD_SIZE: tl.constexpr, # int - HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 - USE_ALIBI_SLOPES: tl.constexpr, # bool - USE_SOFTCAP: tl.constexpr, # bool - SLIDING_WINDOW: tl.constexpr, # int - stride_k_cache_0: tl.int64, # int - stride_k_cache_1: tl.int64, # int - stride_k_cache_2: tl.int64, # int - stride_k_cache_3: tl.constexpr, # int - stride_v_cache_0: tl.int64, # int - stride_v_cache_1: tl.int64, # int - stride_v_cache_2: tl.int64, # int - stride_v_cache_3: tl.constexpr, # int - query_start_len_ptr, # [num_seqs+1] - BLOCK_Q: tl.constexpr, # int - num_seqs: tl.int32, - BLOCK_M: tl.constexpr, # int - NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int + segm_output_ptr, + # [num_tokens, num_query_heads, num_segments, head_size] + segm_max_ptr, # [num_tokens, num_query_heads, num_segments] + segm_expsum_ptr, # [num_tokens, num_query_heads, num_segments] + query_ptr, # [num_tokens, num_query_heads, head_size] + key_cache_ptr, # [num_blks, num_kv_heads, head_size // x, blk_size, x] + value_cache_ptr, # [num_blks, num_kv_heads, head_size, blk_size] + block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_query_heads] + scale, # float32 + k_scale, # float32 + v_scale, # float32 + softcap, # float32 + num_query_heads: tl.constexpr, # int + num_queries_per_kv: tl.constexpr, # int + block_table_stride: tl.int64, # int + query_stride_0: tl.int64, # int + query_stride_1: tl.int64, # int, should be equal to head_size + BLOCK_SIZE: tl.constexpr, # int + HEAD_SIZE: tl.constexpr, # int + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + USE_ALIBI_SLOPES: tl.constexpr, # bool + USE_SOFTCAP: tl.constexpr, # bool + SLIDING_WINDOW: tl.constexpr, # int + stride_k_cache_0: tl.int64, # int + stride_k_cache_1: tl.int64, # int + stride_k_cache_2: tl.int64, # int + stride_k_cache_3: tl.constexpr, # int + stride_v_cache_0: tl.int64, # int + stride_v_cache_1: tl.int64, # int + stride_v_cache_2: tl.int64, # int + 
stride_v_cache_3: tl.constexpr, # int + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + num_seqs: tl.int32, + BLOCK_M: tl.constexpr, # int + NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int ): q_block_global_idx = tl.program_id(0) kv_head_idx = tl.program_id(1) segm_idx = tl.program_id(2) - seq_idx = find_seq_idx( - query_start_len_ptr, q_block_global_idx, num_seqs, BLOCK_Q, True - ) + seq_idx = find_seq_idx(query_start_len_ptr, q_block_global_idx, num_seqs, + BLOCK_Q, True) - q_block_start_idx = tl.load(query_start_len_ptr + seq_idx) // BLOCK_Q + seq_idx + q_block_start_idx = tl.load(query_start_len_ptr + + seq_idx) // BLOCK_Q + seq_idx q_block_local_idx = q_block_global_idx - q_block_start_idx cur_batch_in_all_start_index = tl.load(query_start_len_ptr + seq_idx) cur_batch_in_all_stop_index = tl.load(query_start_len_ptr + seq_idx + 1) - cur_batch_query_len = cur_batch_in_all_stop_index - cur_batch_in_all_start_index + cur_batch_query_len = cur_batch_in_all_stop_index \ + - cur_batch_in_all_start_index if q_block_local_idx * BLOCK_Q >= cur_batch_query_len: return @@ -600,13 +341,11 @@ def kernel_unified_attention_3d( query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv query_offset_0 = cur_batch_in_all_start_index + query_pos - query_offset_1 = kv_head_idx * num_queries_per_kv + offs_m % num_queries_per_kv + query_offset_1 = kv_head_idx * num_queries_per_kv + \ + offs_m % num_queries_per_kv - query_offset = ( - query_offset_0[:, None] * query_stride_0 - + query_offset_1[:, None] * query_stride_1 - + offs_d[None, :] - ) + query_offset = (query_offset_0[:, None] * query_stride_0 + + query_offset_1[:, None] * query_stride_1 + offs_d[None, :]) dim_mask = tl.where(offs_d < HEAD_SIZE, 1, 0).to(tl.int1) query_mask_0 = tl.where(query_pos < cur_batch_query_len, 1, 0).to(tl.int1) @@ -630,37 +369,35 @@ def kernel_unified_attention_3d( # alibi slope for this head if USE_ALIBI_SLOPES: - alibi_slope = tl.load( - alibi_slopes_ptr + query_offset_1, mask=query_mask_1, other=0.0 - ) + alibi_slope = tl.load(alibi_slopes_ptr + query_offset_1, + mask=query_mask_1, + other=0.0) num_blocks = cdiv_fn(seq_len, BLOCK_SIZE) # iterate through tiles within current segment for j in range( - segm_idx * blocks_per_segment, - min((segm_idx + 1) * blocks_per_segment, num_blocks), + segm_idx * blocks_per_segment, + min((segm_idx + 1) * blocks_per_segment, num_blocks), ): physical_block_idx = tl.load(block_tables_ptr + block_table_offset + j) offs_n = tl.arange(0, BLOCK_SIZE) - v_offset = ( - physical_block_idx * stride_v_cache_0 - + kv_head_idx * stride_v_cache_2 - + offs_d[None, :] * stride_v_cache_3 - + offs_n[:, None] * stride_v_cache_1 - ) + v_offset = (physical_block_idx * stride_v_cache_0 + + kv_head_idx * stride_v_cache_2 + + offs_d[None, :] * stride_v_cache_3 + + offs_n[:, None] * stride_v_cache_1) - k_offset = ( - physical_block_idx * stride_k_cache_0 - + kv_head_idx * stride_k_cache_2 - + offs_d[:, None] * stride_k_cache_3 - + offs_n[None, :] * stride_k_cache_1 - ) + k_offset = (physical_block_idx * stride_k_cache_0 + + kv_head_idx * stride_k_cache_2 + + offs_d[:, None] * stride_k_cache_3 + + offs_n[None, :] * stride_k_cache_1) # K : (HEAD_SIZE, BLOCK_SIZE) - K_load = tl.load(key_cache_ptr + k_offset, mask=dim_mask[:, None], other=0.0) + K_load = tl.load(key_cache_ptr + k_offset, + mask=dim_mask[:, None], + other=0.0) if K_load.dtype.is_fp8(): if Q.dtype.is_fp8(): @@ -671,7 +408,9 @@ def kernel_unified_attention_3d( K = K_load # V : (BLOCK_SIZE, HEAD_SIZE) - V_load = 
tl.load(value_cache_ptr + v_offset, mask=dim_mask[None, :], other=0.0) + V_load = tl.load(value_cache_ptr + v_offset, + mask=dim_mask[None, :], + other=0.0) if V_load.dtype.is_fp8(): if Q.dtype.is_fp8(): @@ -693,16 +432,12 @@ def kernel_unified_attention_3d( if USE_SOFTCAP: S = apply_softcap(S, softcap) - S = tl.where( - query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, S, float("-inf") - ) + S = tl.where(query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, + S, float("-inf")) if SLIDING_WINDOW > 0: - S = tl.where( - (context_len + query_pos[:, None] - seq_offset) < SLIDING_WINDOW, - S, - float("-inf"), - ) + S = tl.where((context_len + query_pos[:, None] - seq_offset) + < SLIDING_WINDOW, S, float("-inf")) if USE_ALIBI_SLOPES: S += alibi_slope[:, None] * (seq_offset - context_len) @@ -734,52 +469,49 @@ def kernel_unified_attention_3d( acc += tl.dot(P.to(V.dtype), V) segm_output_offset = ( - query_offset_0[:, None].to(tl.int64) - * (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) - + query_offset_1[:, None] * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) - + segm_idx * HEAD_SIZE_PADDED - + tl.arange(0, HEAD_SIZE_PADDED)[None, :] - ) + query_offset_0[:, None].to(tl.int64) * + (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + query_offset_1[:, None] * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + segm_idx * HEAD_SIZE_PADDED + tl.arange(0, HEAD_SIZE_PADDED)[None, :]) tl.store( segm_output_ptr + segm_output_offset, acc, mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], ) - segm_offset = ( - query_offset_0.to(tl.int64) * (num_query_heads * NUM_SEGMENTS_PER_SEQ) - + query_offset_1 * NUM_SEGMENTS_PER_SEQ - + segm_idx - ) + segm_offset = (query_offset_0.to(tl.int64) * + (num_query_heads * NUM_SEGMENTS_PER_SEQ) + + query_offset_1 * NUM_SEGMENTS_PER_SEQ + segm_idx) tl.store(segm_max_ptr + segm_offset, M, mask=query_mask_0 & query_mask_1) - tl.store(segm_expsum_ptr + segm_offset, L, mask=query_mask_0 & query_mask_1) + tl.store(segm_expsum_ptr + segm_offset, + L, + mask=query_mask_0 & query_mask_1) @triton.jit def reduce_segments( - output_ptr, # [num_tokens, num_query_heads, head_size] - segm_output_ptr, - # [num_tokens, num_query_heads, max_num_segments, head_size] - segm_max_ptr, # [num_tokens, num_query_heads, max_num_segments] - segm_expsum_ptr, # [num_tokens, num_query_heads, max_num_segments] - seq_lens_ptr, # [num_seqs] - num_seqs, # int - num_query_heads: tl.constexpr, # int - output_stride_0: tl.int64, # int - output_stride_1: tl.int64, # int, should be equal to head_size - block_table_stride: tl.int64, # int - BLOCK_SIZE: tl.constexpr, # int - HEAD_SIZE: tl.constexpr, # int, must be power of 2 - HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 - query_start_len_ptr, # [num_seqs+1] - BLOCK_Q: tl.constexpr, # int - NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int + output_ptr, # [num_tokens, num_query_heads, head_size] + segm_output_ptr, + #[num_tokens, num_query_heads, max_num_segments, head_size] + segm_max_ptr, # [num_tokens, num_query_heads, max_num_segments] + segm_expsum_ptr, # [num_tokens, num_query_heads, max_num_segments] + seq_lens_ptr, # [num_seqs] + num_seqs, # int + num_query_heads: tl.constexpr, # int + output_stride_0: tl.int64, # int + output_stride_1: tl.int64, # int, should be equal to head_size + block_table_stride: tl.int64, # int + BLOCK_SIZE: tl.constexpr, # int + HEAD_SIZE: tl.constexpr, # int, must be power of 2 + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + query_start_len_ptr, # [num_seqs+1] + 
BLOCK_Q: tl.constexpr, # int + NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int ): query_token_idx = tl.program_id(0) query_head_idx = tl.program_id(1) - seq_idx = find_seq_idx( - query_start_len_ptr, query_token_idx, num_seqs, BLOCK_Q, False - ) + seq_idx = find_seq_idx(query_start_len_ptr, query_token_idx, num_seqs, + BLOCK_Q, False) # sequence len for this particular sequence seq_len = tl.load(seq_lens_ptr + seq_idx) @@ -791,32 +523,34 @@ def reduce_segments( # create masks for subsequent loads act_num_segments = cdiv_fn(seq_len, blocks_per_segment * BLOCK_SIZE) segm_mask = tl.arange(0, NUM_SEGMENTS_PER_SEQ) < tl.full( - [NUM_SEGMENTS_PER_SEQ], act_num_segments, dtype=tl.int32 - ) - dim_mask = tl.where(tl.arange(0, HEAD_SIZE_PADDED) < HEAD_SIZE, 1, 0).to(tl.int1) + [NUM_SEGMENTS_PER_SEQ], act_num_segments, dtype=tl.int32) + dim_mask = tl.where(tl.arange(0, HEAD_SIZE_PADDED) < HEAD_SIZE, 1, + 0).to(tl.int1) # load segment maxima - segm_offset = ( - query_token_idx.to(tl.int64) * (num_query_heads * NUM_SEGMENTS_PER_SEQ) - + query_head_idx * NUM_SEGMENTS_PER_SEQ - + tl.arange(0, NUM_SEGMENTS_PER_SEQ) - ) - segm_max = tl.load(segm_max_ptr + segm_offset, mask=segm_mask, other=float("-inf")) + segm_offset = (query_token_idx.to(tl.int64) * + (num_query_heads * NUM_SEGMENTS_PER_SEQ) + + query_head_idx * NUM_SEGMENTS_PER_SEQ + + tl.arange(0, NUM_SEGMENTS_PER_SEQ)) + segm_max = tl.load(segm_max_ptr + segm_offset, + mask=segm_mask, + other=float("-inf")) overall_max = tl.max(segm_max) # load and rescale segment exp sums - segm_expsum = tl.load(segm_expsum_ptr + segm_offset, mask=segm_mask, other=0.0) + segm_expsum = tl.load(segm_expsum_ptr + segm_offset, + mask=segm_mask, + other=0.0) segm_expsum = segm_expsum * tl.exp(segm_max - overall_max) overall_expsum = tl.sum(segm_expsum) # load, rescale, and add segment attention outputs segm_output_offset = ( - query_token_idx.to(tl.int64) - * (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) - + query_head_idx * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) - + tl.arange(0, NUM_SEGMENTS_PER_SEQ)[:, None] * HEAD_SIZE_PADDED - + tl.arange(0, HEAD_SIZE_PADDED)[None, :] - ) + query_token_idx.to(tl.int64) * + (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + query_head_idx * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + tl.arange(0, NUM_SEGMENTS_PER_SEQ)[:, None] * HEAD_SIZE_PADDED + + tl.arange(0, HEAD_SIZE_PADDED)[None, :]) segm_output = tl.load( segm_output_ptr + segm_output_offset, mask=segm_mask[:, None] & dim_mask[None, :], @@ -828,11 +562,9 @@ def reduce_segments( acc = tl.where(overall_expsum == 0.0, 0.0, acc_sum / overall_expsum) # write result - output_offset = ( - query_token_idx * output_stride_0 - + query_head_idx * output_stride_1 - + tl.arange(0, HEAD_SIZE_PADDED) - ) + output_offset = (query_token_idx * output_stride_0 + + query_head_idx * output_stride_1 + + tl.arange(0, HEAD_SIZE_PADDED)) tl.store(output_ptr + output_offset, acc, mask=dim_mask) @@ -845,8 +577,6 @@ def unified_attention( max_seqlen_q, seqused_k, max_seqlen_k, - avg_seqlen_q, - avg_seqlen_k, softmax_scale, causal, window_size, @@ -862,9 +592,8 @@ def unified_attention( assert q_descale is None, "Q scales not supported" block_size = v.shape[1] - assert ( - q.element_size() >= 2 or block_size >= 32 - ), "Block size must be at least 32 for fp8" + assert q.element_size() >= 2 or block_size >= 32, \ + "Block size must be at least 32 for fp8" use_alibi_slopes = alibi_slopes is not None @@ -875,20 +604,27 @@ def unified_attention( num_queries_per_kv = num_query_heads // 
num_kv_heads head_size = q.shape[2] - MAX_SEQ_Q = triton.next_power_of_2(int(max_seqlen_q)) - MAX_SEQ_K = triton.next_power_of_2(int(max_seqlen_k)) - AVG_SEQ_Q = triton.next_power_of_2(int(avg_seqlen_q)) - AVG_SEQ_K = triton.next_power_of_2(int(avg_seqlen_k)) + BLOCK_M = 16 + BLOCK_Q = BLOCK_M // num_queries_per_kv - # if batch contains a prefill - if max_seqlen_q > 1 or force_selection == 2 and force_selection != 3: + # Ideally we would launch with kernel with: + # \sum_i[ceil(query_len[i] / BLOCK_Q)] blocks. + # However, it is slow to realize the query_lens on cpu. + # Instead we use upper-bound: + # \sum_i[ceil(query_len[i] / BLOCK_Q)] + # <= \sum_i[floor(query_len[i] / BLOCK_Q) + 1] + # = \sum_i[floor(query_len[i] / BLOCK_Q)] + num_seqs + # <= floor(\sum_i(query_len[i]) / BLOCK_Q) + num_seqs + # = floor(q.shape[0] / BLOCK_Q) + num_seqs + total_num_q_blocks = q.shape[0] // BLOCK_Q + num_seqs - grid = lambda META: ( - q.shape[0] // (META["BLOCK_M"] // num_queries_per_kv) + num_seqs, + # if batch contains a prefill + # if (max_seqlen_q > 1 or total_num_q_blocks * num_kv_heads > 128 or force_selection == 2) and force_selection != 3: + if force_selection == 2: + kernel_unified_attention_2d[( + total_num_q_blocks, num_kv_heads, - ) - - kernel_unified_attention_2d[grid]( + )]( output_ptr=out, query_ptr=q, key_cache_ptr=k, @@ -922,27 +658,11 @@ def unified_attention( stride_v_cache_2=v.stride(2), stride_v_cache_3=v.stride(3), query_start_len_ptr=cu_seqlens_q, + BLOCK_Q=BLOCK_Q, num_seqs=num_seqs, - MAX_SEQ_Q=MAX_SEQ_Q, - MAX_SEQ_K=MAX_SEQ_K, - AVG_SEQ_Q=AVG_SEQ_Q, - AVG_SEQ_K=AVG_SEQ_K, + BLOCK_M=BLOCK_M, ) - else: - BLOCK_M = 64 if max_seqlen_q > 1 and avg_seqlen_q >= 4096 else 16 - BLOCK_Q = BLOCK_M // num_queries_per_kv - - # Ideally we would launch with kernel with: - # \sum_i[ceil(query_len[i] / BLOCK_Q)] blocks. - # However, it is slow to realize the query_lens on cpu. 
- # Instead we use upper-bound: - # \sum_i[ceil(query_len[i] / BLOCK_Q)] - # <= \sum_i[floor(query_len[i] / BLOCK_Q) + 1] - # = \sum_i[floor(query_len[i] / BLOCK_Q)] + num_seqs - # <= floor(\sum_i(query_len[i]) / BLOCK_Q) + num_seqs - # = floor(q.shape[0] / BLOCK_Q) + num_seqs - total_num_q_blocks = q.shape[0] // BLOCK_Q + num_seqs - + elif force_selection == 3: # for initial version, NUM_SEGMENTS = 16 is chosen as a default # value that showed good performance in tests NUM_SEGMENTS = 16 @@ -970,45 +690,46 @@ def unified_attention( device=q.device, ) - kernel_unified_attention_3d[(total_num_q_blocks, num_kv_heads, NUM_SEGMENTS)]( - segm_output_ptr=segm_output, - segm_max_ptr=segm_max, - segm_expsum_ptr=segm_expsum, - query_ptr=q, - key_cache_ptr=k, - value_cache_ptr=v, - block_tables_ptr=block_table, - seq_lens_ptr=seqused_k, - alibi_slopes_ptr=alibi_slopes, - scale=softmax_scale, - k_scale=k_descale, - v_scale=v_descale, - softcap=softcap, - num_query_heads=num_query_heads, - num_queries_per_kv=num_queries_per_kv, - block_table_stride=block_table.stride(0), - query_stride_0=q.stride(0), - query_stride_1=q.stride(1), - BLOCK_SIZE=block_size, - HEAD_SIZE=head_size, - HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), - USE_ALIBI_SLOPES=use_alibi_slopes, - USE_SOFTCAP=(softcap > 0), - SLIDING_WINDOW=(1 + window_size[0]), - stride_k_cache_0=k.stride(0), - stride_k_cache_1=k.stride(1), - stride_k_cache_2=k.stride(2), - stride_k_cache_3=k.stride(3), - stride_v_cache_0=v.stride(0), - stride_v_cache_1=v.stride(1), - stride_v_cache_2=v.stride(2), - stride_v_cache_3=v.stride(3), - query_start_len_ptr=cu_seqlens_q, - BLOCK_Q=BLOCK_Q, - num_seqs=num_seqs, - BLOCK_M=BLOCK_M, - NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, - ) + kernel_unified_attention_3d[( + total_num_q_blocks, num_kv_heads, NUM_SEGMENTS)]( + segm_output_ptr=segm_output, + segm_max_ptr=segm_max, + segm_expsum_ptr=segm_expsum, + query_ptr=q, + key_cache_ptr=k, + value_cache_ptr=v, + block_tables_ptr=block_table, + seq_lens_ptr=seqused_k, + alibi_slopes_ptr=alibi_slopes, + scale=softmax_scale, + k_scale=k_descale, + v_scale=v_descale, + softcap=softcap, + num_query_heads=num_query_heads, + num_queries_per_kv=num_queries_per_kv, + block_table_stride=block_table.stride(0), + query_stride_0=q.stride(0), + query_stride_1=q.stride(1), + BLOCK_SIZE=block_size, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + USE_ALIBI_SLOPES=use_alibi_slopes, + USE_SOFTCAP=(softcap > 0), + SLIDING_WINDOW=(1 + window_size[0]), + stride_k_cache_0=k.stride(0), + stride_k_cache_1=k.stride(1), + stride_k_cache_2=k.stride(2), + stride_k_cache_3=k.stride(3), + stride_v_cache_0=v.stride(0), + stride_v_cache_1=v.stride(1), + stride_v_cache_2=v.stride(2), + stride_v_cache_3=v.stride(3), + query_start_len_ptr=cu_seqlens_q, + BLOCK_Q=BLOCK_Q, + num_seqs=num_seqs, + BLOCK_M=BLOCK_M, + NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, + ) reduce_segments[(q.shape[0], num_query_heads)]( output_ptr=out, @@ -1028,3 +749,6 @@ def unified_attention( BLOCK_Q=BLOCK_Q, NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, ) + else: + raise RuntimeError("currently, we need to force a kernel selection") + diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_tuned.py b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_tuned.py new file mode 100644 index 000000000..7b1a17d23 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_tuned.py @@ -0,0 +1,1030 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project + +# Authors: +# - Burkhard Ringlein +# - Jan van Lunteren +# - Chih-Chieh Yang +# - Thomas Parnell + +import torch +import triton +import triton.language as tl + +import os +import triton_dejavu +import functools + + +@triton.jit +def cdiv_fn(x, y): + return (x + y - 1) // y + + +@triton.jit +def apply_softcap(S, x): + Sdiv = S / x + p1 = tl.exp(Sdiv) + p2 = tl.exp(-Sdiv) + return x * (p1 - p2) / (p1 + p2) + + +@triton.jit +def find_seq_idx( + query_start_len_ptr, + target_idx, + num_seqs, + BLOCK_Q: tl.constexpr, + use_q_block_mode: tl.constexpr, +): + left: tl.int32 = 0 + right = num_seqs + while left < right: + mid = (left + right) // 2 + val = tl.load(query_start_len_ptr + mid) + mid_val = val // BLOCK_Q + mid if use_q_block_mode else val + + if mid_val <= target_idx: + left = mid + 1 + else: + right = mid + + return left - 1 + + +# not as lambda, for python3.9 +def fallback_heuristic_dt2(key): + tpa_test_q = key[1] + tpa_test_k = key[2] + # Model trained on max + if tpa_test_q < 1024: + BLOCK_M = 16 + else: + BLOCK_M = 64 + + if tpa_test_k < 64: + if tpa_test_k < 32: + BLOCK_N = 16 + else: + BLOCK_N = 32 + else: + if tpa_test_q < 256: + BLOCK_N = 128 + else: + BLOCK_N = 64 + ret = triton.Config( + {"BLOCK_M": BLOCK_M, "BLOCK_N": BLOCK_N}, num_stages=2, num_warps=8 + ) + # num stages = 2, to be on the safe side for MI300 + return ret + + +def informed_fallback_next(key, cache): + # key[0] = max q + # key[2] = avg q + ret = cache[min(cache.keys(), key=lambda x: abs(x - key[0]))] + return ret + + +def prepare_informed_fallback(cache): + ret = {int(k[0]): c for k, c in cache.items()} + return ret + + +@functools.lru_cache +def prefill_heuristics_2d(MAX_SEQ_Q, MAX_SEQ_K, AVG_SEQ_Q, AVG_SEQ_K): + gpu_name = torch.cuda.get_device_name() + # print(f"MAX_SEQ_Q {MAX_SEQ_Q}, MAX_SEQ_K {MAX_SEQ_K}, AVG_SEQ_Q {AVG_SEQ_Q}, AVG_SEQ_K {AVG_SEQ_K}") + if "NVIDIA H100" in gpu_name: + # # TPA original heuristic + # if MAX_SEQ_Q < 1024: + # BLOCK_M = 16 + # else: + # BLOCK_M = 64 + # if MAX_SEQ_K < 64: + # if MAX_SEQ_K < 32: + # BLOCK_N = 16 + # else: + # BLOCK_N = 32 + # else: + # if MAX_SEQ_Q < 256: + # BLOCK_N = 128 + # else: + # BLOCK_N = 64 + # config = {'num_stages': 3, 'num_warps': 4, + # 'BLOCK_N': BLOCK_N, 'BLOCK_M': BLOCK_M} + # dejavu with microbenchmarks + # TODO: update to latest tuning with AVG + if MAX_SEQ_K <= 96: + config = {"num_stages": 4, "num_warps": 4, "BLOCK_N": 32, "BLOCK_M": 16} + else: + if MAX_SEQ_Q <= 192: + if MAX_SEQ_K <= 1536: + config = { + "num_stages": 2, + "num_warps": 8, + "BLOCK_N": 128, + "BLOCK_M": 16, + } + else: + config = { + "num_stages": 8, + "num_warps": 8, + "BLOCK_N": 128, + "BLOCK_M": 16, + } + else: + config = { + "num_stages": 1, + "num_warps": 8, + "BLOCK_N": 128, + "BLOCK_M": 128, + } + elif "AMD Instinct MI300" in gpu_name: + # dejavu with microbenchmarks + # TODO: update to latest tuning with AVG + if MAX_SEQ_Q <= 384: + if MAX_SEQ_K <= 96: + config = {"num_stages": 4, "num_warps": 4, "BLOCK_N": 32, "BLOCK_M": 16} + else: + if MAX_SEQ_K <= 192: + if MAX_SEQ_Q <= 96: + config = { + "num_stages": 2, + "num_warps": 8, + "BLOCK_N": 128, + "BLOCK_M": 16, + } + else: + config = { + "num_stages": 4, + "num_warps": 4, + "BLOCK_N": 32, + "BLOCK_M": 16, + } + else: + if MAX_SEQ_Q <= 128: + config = { + "num_stages": 4, + "num_warps": 4, + "BLOCK_N": 32, + "BLOCK_M": 16, + } + else: + if MAX_SEQ_K <= 384: + config = { + "num_stages": 4, + "num_warps": 4, + "BLOCK_N": 32, + "BLOCK_M": 16, + } + else: + config = { + "num_stages": 
1, + "num_warps": 4, + "BLOCK_N": 256, + "BLOCK_M": 32, + } + else: + if MAX_SEQ_K <= 768: + config = {"num_stages": 4, "num_warps": 4, "BLOCK_N": 16, "BLOCK_M": 64} + else: + config = {"num_stages": 1, "num_warps": 2, "BLOCK_N": 64, "BLOCK_M": 64} + else: + # default + config = { + "BLOCK_M": 64 if MAX_SEQ_Q > 1 and AVG_SEQ_Q >= 4096 else 16, + "BLOCK_N": 16 if MAX_SEQ_K < 128 and AVG_SEQ_Q <= 4096 else 64, + "num_warps": 4, + "num_stages": 3, + } + # print(config) + return config + + +# @triton_dejavu.jitcache( +# # this list is shorter, since it will be called only within one model +# check_keys=[ +# "MAX_SEQ_Q", +# "MAX_SEQ_K", +# "AVG_SEQ_Q", +# "AVG_SEQ_K", +# "stride_k_cache_3", +# "stride_v_cache_3", +# ], +# check_specialization=["num_seqs"], +# assume_const=[ +# "scale", +# "k_scale", +# "v_scale", +# "query_stride_1", +# "output_stride_1", +# "stride_k_cache_0", +# "stride_k_cache_1", +# "stride_k_cache_2", +# "stride_k_cache_4", +# "stride_v_cache_0", +# "stride_v_cache_1", +# "stride_v_cache_2", +# ], +# autotuner_args=["BLOCK_N", "BLOCK_M"], +# ) +@triton_dejavu.autotune( + config_space=triton_dejavu.ConfigSpace( + { + "BLOCK_N": [16, 32, 64, 128, 256, 512], + "BLOCK_M": [16, 32, 64, 128, 256, 512], + }, + num_warps=[2, 4, 8], + num_stages=[1, 2, 4, 6, 8], + ), + # this list is longer, since it would be used for multiple models + key=[ + "MAX_SEQ_Q", + "MAX_SEQ_K", + "AVG_SEQ_Q", + "AVG_SEQ_K", + "num_query_heads", + "num_queries_per_kv", + "BLOCK_SIZE", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "SLIDING_WINDOW", + "stride_k_cache_3", + "stride_v_cache_3", + ], + custom_data_storage=os.path.abspath( + os.path.join(os.path.dirname(__file__), "dejavu_data") + ), + use_cuda_graph=True, + use_bo=True, + search_max_search_t=360, + informed_fallback=informed_fallback_next, + prepare_informed_fallback=prepare_informed_fallback, + fallback_heuristic=fallback_heuristic_dt2, + ignore_dtypes=True, +) +# @triton.heuristics( +# { +# "BLOCK_M": lambda args: prefill_heuristics_2d(args['MAX_SEQ_Q'], args['MAX_SEQ_K'], args['AVG_SEQ_Q'], args['AVG_SEQ_K'])['BLOCK_M'], +# "BLOCK_N": lambda args: prefill_heuristics_2d(args['MAX_SEQ_Q'], args['MAX_SEQ_K'], args['AVG_SEQ_Q'], args['AVG_SEQ_K'])['BLOCK_N'], +# "num_warps": lambda args: prefill_heuristics_2d(args['MAX_SEQ_Q'], args['MAX_SEQ_K'], args['AVG_SEQ_Q'], args['AVG_SEQ_K'])['num_warps'], +# "num_stages": lambda args: prefill_heuristics_2d(args['MAX_SEQ_Q'], args['MAX_SEQ_K'], args['AVG_SEQ_Q'], args['AVG_SEQ_K'])['num_stages'], +# } +# ) +@triton.jit +def kernel_unified_attention_2d( + output_ptr, # [num_tokens, num_query_heads, head_size] + query_ptr, # [num_tokens, num_query_heads, head_size] + key_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + value_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_query_heads] + scale, # float32 + k_scale, # float32 + v_scale, # float32 + softcap, # float32 + num_query_heads: tl.constexpr, # int + num_queries_per_kv: tl.constexpr, # int + block_table_stride: tl.int64, # int + query_stride_0: tl.int64, # int + query_stride_1: tl.int64, # int, should be equal to head_size + output_stride_0: tl.int64, # int + output_stride_1: tl.int64, # int, should be equal to head_size + BLOCK_SIZE: tl.constexpr, # int + HEAD_SIZE: tl.constexpr, # int + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + USE_ALIBI_SLOPES: tl.constexpr, # bool + USE_SOFTCAP: tl.constexpr, # bool + 
SLIDING_WINDOW: tl.constexpr, # int + stride_k_cache_0: tl.int64, # int + stride_k_cache_1: tl.int64, # int + stride_k_cache_2: tl.int64, # int + stride_k_cache_3: tl.constexpr, # int + stride_v_cache_0: tl.int64, # int + stride_v_cache_1: tl.int64, # int + stride_v_cache_2: tl.int64, # int + stride_v_cache_3: tl.constexpr, # int + query_start_len_ptr, # [num_seqs+1] + num_seqs: tl.int32, + # used as input to the autotuner/heuristics + MAX_SEQ_Q: tl.constexpr, + MAX_SEQ_K: tl.constexpr, + AVG_SEQ_Q: tl.constexpr, + AVG_SEQ_K: tl.constexpr, + # autotuner args + BLOCK_M: tl.constexpr, # int + BLOCK_N: tl.constexpr, # int +): + + q_block_global_idx = tl.program_id(0) + kv_head_idx = tl.program_id(1) + BLOCK_Q = BLOCK_M // num_queries_per_kv + + seq_idx = find_seq_idx( + query_start_len_ptr, q_block_global_idx, num_seqs, BLOCK_Q, True + ) + + q_block_start_idx = tl.load(query_start_len_ptr + seq_idx) // BLOCK_Q + seq_idx + + q_block_local_idx = q_block_global_idx - q_block_start_idx + + cur_batch_in_all_start_index = tl.load(query_start_len_ptr + seq_idx) + cur_batch_in_all_stop_index = tl.load(query_start_len_ptr + seq_idx + 1) + + cur_batch_query_len = cur_batch_in_all_stop_index - cur_batch_in_all_start_index + + if q_block_local_idx * BLOCK_Q >= cur_batch_query_len: + return + + offs_m = tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_SIZE_PADDED) + query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv + + query_offset_0 = cur_batch_in_all_start_index + query_pos + query_offset_1 = kv_head_idx * num_queries_per_kv + offs_m % num_queries_per_kv + query_offset = ( + query_offset_0[:, None] * query_stride_0 + + query_offset_1[:, None] * query_stride_1 + + offs_d[None, :] + ) + + dim_mask = tl.where(offs_d < HEAD_SIZE, 1, 0).to(tl.int1) + query_mask_0 = tl.where(query_pos < cur_batch_query_len, 1, 0).to(tl.int1) + query_mask_1 = tl.where(query_offset_1 < num_query_heads, 1, 0).to(tl.int1) + + # Q : (BLOCK_M, HEAD_SIZE_PADDED) + Q = tl.load( + query_ptr + query_offset, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + other=0.0, + ) + + block_table_offset = seq_idx * block_table_stride + + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + L = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) + + # sequence len for this particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # context length for this particular sequences + context_len = seq_len - cur_batch_query_len + + # alibi slope for this head + if USE_ALIBI_SLOPES: + alibi_slope = tl.load( + alibi_slopes_ptr + query_offset_1, mask=query_mask_1, other=0.0 + ) + + # compute the length of the longest sequence prefix spanned by any + # query token in the current q_block (q_block_local_idx) + max_seq_prefix_len = ( + context_len + + q_block_local_idx * BLOCK_Q + + (BLOCK_M - 1) // num_queries_per_kv + + 1 + ) + + # adjust for potential padding in the last q_block by considering the + # actual sequence length + max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len) + + offs_n = tl.arange(0, BLOCK_N) + + # iterate through tiles (below the mask) + # The loop iterates only until the longest sequence. Due to causal + # masking, blocks beyond this prefix can be skipped. 
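+    # Worked example (illustrative numbers only, not taken from any real run):
+    # with num_queries_per_kv = 4 and BLOCK_M = 16 (so BLOCK_Q = 4),
+    # context_len = 100 and q_block_local_idx = 2, the last query position
+    # handled by this program is 2 * 4 + (16 - 1) // 4 = 11, and causal
+    # masking lets it attend to at most 100 + 11 + 1 = 112 key tokens, so
+    # the tile loop below never runs past min(112, seq_len).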
+ for start_n in range(0, max_seq_prefix_len, BLOCK_N): + + start_n = tl.multiple_of(start_n, BLOCK_N) + + physical_block_idx = tl.load( + block_tables_ptr + block_table_offset + (start_n + offs_n) // BLOCK_SIZE, + mask=(start_n + offs_n) < seq_len, + other=0, + ) + + v_offset = ( + physical_block_idx[:, None] * stride_v_cache_0 + + kv_head_idx * stride_v_cache_2 + + offs_d[None, :] * stride_v_cache_3 + + (offs_n[:, None] % BLOCK_SIZE) * stride_v_cache_1 + ) + + k_offset = ( + physical_block_idx[None, :] * stride_k_cache_0 + + kv_head_idx * stride_k_cache_2 + + offs_d[:, None] * stride_k_cache_3 + + (offs_n[None, :] % BLOCK_SIZE) * stride_k_cache_1 + ) + + seq_offset_load = start_n + offs_n + load_mask = seq_offset_load < max_seq_prefix_len + + # K : (HEAD_SIZE_PADDED, BLOCK_N) + K_load = tl.load( + key_cache_ptr + k_offset, + mask=dim_mask[:, None] & load_mask[None, :], + other=0.0, + ) + + if K_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + K = K_load + else: + K = (K_load.to(tl.float32) * tl.load(k_scale)).to(Q.dtype) + else: + K = K_load + + # V : (BLOCK_N, HEAD_SIZE_PADDED) + V_load = tl.load( + value_cache_ptr + v_offset, + mask=dim_mask[None, :] & load_mask[:, None], + other=0.0, + ) + + if V_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + V = V_load + else: + V = (V_load.to(tl.float32) * tl.load(v_scale)).to(Q.dtype) + else: + V = V_load + + seq_offset = start_n + tl.arange(0, BLOCK_N) + + # seq_mask: (BLOCK_M, BLOCK_N) + seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 + + # S : (BLOCK_M, BLOCK_N) + S = tl.zeros(shape=(BLOCK_M, BLOCK_N), dtype=tl.float32) + + S += scale * tl.dot(Q, K) + + if USE_SOFTCAP: + S = apply_softcap(S, softcap) + + S = tl.where( + query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, S, float("-inf") + ) + + if SLIDING_WINDOW > 0: + S = tl.where( + (context_len + query_pos[:, None] - seq_offset) < SLIDING_WINDOW, + S, + float("-inf"), + ) + + if USE_ALIBI_SLOPES: + S += alibi_slope[:, None] * (seq_offset - context_len) + + # compute running maximum + # m_j : (BLOCK_M,) + m_j = tl.maximum(M, tl.max(S, axis=1)) + # For sliding window there's a chance the max is -inf due to masking of + # the entire row. 
In this case we need to set m_j 0 to avoid NaN + m_j = tl.where(m_j > float("-inf"), m_j, 0.0) + + # P : (BLOCK_M, BLOCK_N) + P = tl.exp(S - m_j[:, None]) + + # l_j : (BLOCK_M,) + l_j = tl.sum(P, axis=1) + + # alpha : (BLOCK_M, ) + alpha = tl.exp(M - m_j) + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc = acc * alpha[:, None] + + # update constants + L = L * alpha + l_j + M = m_j + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc += tl.dot(P.to(V.dtype), V) + + # epilogue + acc = acc / L[:, None] + + output_offset = ( + query_offset_0[:, None] * output_stride_0 + + query_offset_1[:, None] * output_stride_1 + + offs_d[None, :] + ) + + tl.store( + output_ptr + output_offset, + acc, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + ) + + +@triton.jit +def kernel_unified_attention_3d( + segm_output_ptr, + # [num_tokens, num_query_heads, num_segments, head_size] + segm_max_ptr, # [num_tokens, num_query_heads, num_segments] + segm_expsum_ptr, # [num_tokens, num_query_heads, num_segments] + query_ptr, # [num_tokens, num_query_heads, head_size] + key_cache_ptr, # [num_blks, num_kv_heads, head_size // x, blk_size, x] + value_cache_ptr, # [num_blks, num_kv_heads, head_size, blk_size] + block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_query_heads] + scale, # float32 + k_scale, # float32 + v_scale, # float32 + softcap, # float32 + num_query_heads: tl.constexpr, # int + num_queries_per_kv: tl.constexpr, # int + block_table_stride: tl.int64, # int + query_stride_0: tl.int64, # int + query_stride_1: tl.int64, # int, should be equal to head_size + BLOCK_SIZE: tl.constexpr, # int + HEAD_SIZE: tl.constexpr, # int + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + USE_ALIBI_SLOPES: tl.constexpr, # bool + USE_SOFTCAP: tl.constexpr, # bool + SLIDING_WINDOW: tl.constexpr, # int + stride_k_cache_0: tl.int64, # int + stride_k_cache_1: tl.int64, # int + stride_k_cache_2: tl.int64, # int + stride_k_cache_3: tl.constexpr, # int + stride_v_cache_0: tl.int64, # int + stride_v_cache_1: tl.int64, # int + stride_v_cache_2: tl.int64, # int + stride_v_cache_3: tl.constexpr, # int + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + num_seqs: tl.int32, + BLOCK_M: tl.constexpr, # int + NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int +): + q_block_global_idx = tl.program_id(0) + kv_head_idx = tl.program_id(1) + segm_idx = tl.program_id(2) + + seq_idx = find_seq_idx( + query_start_len_ptr, q_block_global_idx, num_seqs, BLOCK_Q, True + ) + + q_block_start_idx = tl.load(query_start_len_ptr + seq_idx) // BLOCK_Q + seq_idx + + q_block_local_idx = q_block_global_idx - q_block_start_idx + + cur_batch_in_all_start_index = tl.load(query_start_len_ptr + seq_idx) + cur_batch_in_all_stop_index = tl.load(query_start_len_ptr + seq_idx + 1) + + cur_batch_query_len = cur_batch_in_all_stop_index - cur_batch_in_all_start_index + + if q_block_local_idx * BLOCK_Q >= cur_batch_query_len: + return + + # sequence len for this particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # number of segments for this particular sequence + num_segments = NUM_SEGMENTS_PER_SEQ + blocks_per_segment = cdiv_fn(seq_len, num_segments * BLOCK_SIZE) + + if segm_idx * blocks_per_segment * BLOCK_SIZE >= seq_len: + return + + offs_m = tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_SIZE_PADDED) + + query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv + + query_offset_0 = cur_batch_in_all_start_index + query_pos + query_offset_1 = 
kv_head_idx * num_queries_per_kv + offs_m % num_queries_per_kv + + query_offset = ( + query_offset_0[:, None] * query_stride_0 + + query_offset_1[:, None] * query_stride_1 + + offs_d[None, :] + ) + + dim_mask = tl.where(offs_d < HEAD_SIZE, 1, 0).to(tl.int1) + query_mask_0 = tl.where(query_pos < cur_batch_query_len, 1, 0).to(tl.int1) + query_mask_1 = tl.where(query_offset_1 < num_query_heads, 1, 0).to(tl.int1) + + # Q : (BLOCK_M, HEAD_SIZE_PADDED) + Q = tl.load( + query_ptr + query_offset, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + other=0.0, + ) + + block_table_offset = seq_idx * block_table_stride + + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + L = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) + + # context length for this particular sequences + context_len = seq_len - cur_batch_query_len + + # alibi slope for this head + if USE_ALIBI_SLOPES: + alibi_slope = tl.load( + alibi_slopes_ptr + query_offset_1, mask=query_mask_1, other=0.0 + ) + + num_blocks = cdiv_fn(seq_len, BLOCK_SIZE) + + # iterate through tiles within current segment + for j in range( + segm_idx * blocks_per_segment, + min((segm_idx + 1) * blocks_per_segment, num_blocks), + ): + physical_block_idx = tl.load(block_tables_ptr + block_table_offset + j) + + offs_n = tl.arange(0, BLOCK_SIZE) + + v_offset = ( + physical_block_idx * stride_v_cache_0 + + kv_head_idx * stride_v_cache_2 + + offs_d[None, :] * stride_v_cache_3 + + offs_n[:, None] * stride_v_cache_1 + ) + + k_offset = ( + physical_block_idx * stride_k_cache_0 + + kv_head_idx * stride_k_cache_2 + + offs_d[:, None] * stride_k_cache_3 + + offs_n[None, :] * stride_k_cache_1 + ) + + # K : (HEAD_SIZE, BLOCK_SIZE) + K_load = tl.load(key_cache_ptr + k_offset, mask=dim_mask[:, None], other=0.0) + + if K_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + K = K_load + else: + K = (K_load.to(tl.float32) * tl.load(k_scale)).to(Q.dtype) + else: + K = K_load + + # V : (BLOCK_SIZE, HEAD_SIZE) + V_load = tl.load(value_cache_ptr + v_offset, mask=dim_mask[None, :], other=0.0) + + if V_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + V = V_load + else: + V = (V_load.to(tl.float32) * tl.load(v_scale)).to(Q.dtype) + else: + V = V_load + + seq_offset = j * BLOCK_SIZE + offs_n + + seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 + + # S : (BLOCK_M, BLOCK_SIZE) + S = tl.zeros(shape=(BLOCK_M, BLOCK_SIZE), dtype=tl.float32) + + S += scale * tl.dot(Q, K) + + if USE_SOFTCAP: + S = apply_softcap(S, softcap) + + S = tl.where( + query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, S, float("-inf") + ) + + if SLIDING_WINDOW > 0: + S = tl.where( + (context_len + query_pos[:, None] - seq_offset) < SLIDING_WINDOW, + S, + float("-inf"), + ) + + if USE_ALIBI_SLOPES: + S += alibi_slope[:, None] * (seq_offset - context_len) + + # compute running maximum + # m_j : (BLOCK_M,) + m_j = tl.maximum(M, tl.max(S, axis=1)) + # For sliding window there's a chance the max is -inf due to masking of + # the entire row. 
In this case we need to set m_j 0 to avoid NaN + m_j = tl.where(m_j > float("-inf"), m_j, 0.0) + + # P : (BLOCK_M, BLOCK_SIZE,) + P = tl.exp(S - m_j[:, None]) + + # l_j : (BLOCK_M,) + l_j = tl.sum(P, axis=1) + + # alpha : (BLOCK_M, ) + alpha = tl.exp(M - m_j) + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc = acc * alpha[:, None] + + # update constants + L = L * alpha + l_j + M = m_j + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc += tl.dot(P.to(V.dtype), V) + + segm_output_offset = ( + query_offset_0[:, None].to(tl.int64) + * (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + query_offset_1[:, None] * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + segm_idx * HEAD_SIZE_PADDED + + tl.arange(0, HEAD_SIZE_PADDED)[None, :] + ) + tl.store( + segm_output_ptr + segm_output_offset, + acc, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + ) + segm_offset = ( + query_offset_0.to(tl.int64) * (num_query_heads * NUM_SEGMENTS_PER_SEQ) + + query_offset_1 * NUM_SEGMENTS_PER_SEQ + + segm_idx + ) + tl.store(segm_max_ptr + segm_offset, M, mask=query_mask_0 & query_mask_1) + tl.store(segm_expsum_ptr + segm_offset, L, mask=query_mask_0 & query_mask_1) + + +@triton.jit +def reduce_segments( + output_ptr, # [num_tokens, num_query_heads, head_size] + segm_output_ptr, + # [num_tokens, num_query_heads, max_num_segments, head_size] + segm_max_ptr, # [num_tokens, num_query_heads, max_num_segments] + segm_expsum_ptr, # [num_tokens, num_query_heads, max_num_segments] + seq_lens_ptr, # [num_seqs] + num_seqs, # int + num_query_heads: tl.constexpr, # int + output_stride_0: tl.int64, # int + output_stride_1: tl.int64, # int, should be equal to head_size + block_table_stride: tl.int64, # int + BLOCK_SIZE: tl.constexpr, # int + HEAD_SIZE: tl.constexpr, # int, must be power of 2 + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int +): + query_token_idx = tl.program_id(0) + query_head_idx = tl.program_id(1) + + seq_idx = find_seq_idx( + query_start_len_ptr, query_token_idx, num_seqs, BLOCK_Q, False + ) + + # sequence len for this particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # number of segments for this particular sequence + num_segments = NUM_SEGMENTS_PER_SEQ + blocks_per_segment = cdiv_fn(seq_len, num_segments * BLOCK_SIZE) + + # create masks for subsequent loads + act_num_segments = cdiv_fn(seq_len, blocks_per_segment * BLOCK_SIZE) + segm_mask = tl.arange(0, NUM_SEGMENTS_PER_SEQ) < tl.full( + [NUM_SEGMENTS_PER_SEQ], act_num_segments, dtype=tl.int32 + ) + dim_mask = tl.where(tl.arange(0, HEAD_SIZE_PADDED) < HEAD_SIZE, 1, 0).to(tl.int1) + + # load segment maxima + segm_offset = ( + query_token_idx.to(tl.int64) * (num_query_heads * NUM_SEGMENTS_PER_SEQ) + + query_head_idx * NUM_SEGMENTS_PER_SEQ + + tl.arange(0, NUM_SEGMENTS_PER_SEQ) + ) + segm_max = tl.load(segm_max_ptr + segm_offset, mask=segm_mask, other=float("-inf")) + overall_max = tl.max(segm_max) + + # load and rescale segment exp sums + segm_expsum = tl.load(segm_expsum_ptr + segm_offset, mask=segm_mask, other=0.0) + segm_expsum = segm_expsum * tl.exp(segm_max - overall_max) + overall_expsum = tl.sum(segm_expsum) + + # load, rescale, and add segment attention outputs + segm_output_offset = ( + query_token_idx.to(tl.int64) + * (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + query_head_idx * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + tl.arange(0, NUM_SEGMENTS_PER_SEQ)[:, None] * 
HEAD_SIZE_PADDED + + tl.arange(0, HEAD_SIZE_PADDED)[None, :] + ) + segm_output = tl.load( + segm_output_ptr + segm_output_offset, + mask=segm_mask[:, None] & dim_mask[None, :], + other=0.0, + ) + segm_output *= tl.exp(segm_max - overall_max)[:, None] + acc_sum = tl.sum(segm_output, axis=0) + # safely divide by overall_expsum, returning 0.0 if overall_expsum is 0 + acc = tl.where(overall_expsum == 0.0, 0.0, acc_sum / overall_expsum) + + # write result + output_offset = ( + query_token_idx * output_stride_0 + + query_head_idx * output_stride_1 + + tl.arange(0, HEAD_SIZE_PADDED) + ) + tl.store(output_ptr + output_offset, acc, mask=dim_mask) + + +def unified_attention( + q, + k, + v, + out, + cu_seqlens_q, + max_seqlen_q, + seqused_k, + max_seqlen_k, + avg_seqlen_q, + avg_seqlen_k, + softmax_scale, + causal, + window_size, + block_table, + softcap, + q_descale, + k_descale, + v_descale, + alibi_slopes=None, + force_selection=None, # None, 2, 3 to select kernel +): + assert causal, "Only causal attention is supported" + assert q_descale is None, "Q scales not supported" + + block_size = v.shape[1] + assert ( + q.element_size() >= 2 or block_size >= 32 + ), "Block size must be at least 32 for fp8" + + use_alibi_slopes = alibi_slopes is not None + + block_size = v.shape[1] + num_seqs = len(seqused_k) + num_query_heads = q.shape[1] + num_kv_heads = k.shape[2] + num_queries_per_kv = num_query_heads // num_kv_heads + head_size = q.shape[2] + + MAX_SEQ_Q = triton.next_power_of_2(int(max_seqlen_q)) + MAX_SEQ_K = triton.next_power_of_2(int(max_seqlen_k)) + AVG_SEQ_Q = triton.next_power_of_2(int(avg_seqlen_q)) + AVG_SEQ_K = triton.next_power_of_2(int(avg_seqlen_k)) + + # if batch contains a prefill + if (max_seqlen_q > 1 or force_selection == 2) and force_selection != 3: + + grid = lambda META: ( + q.shape[0] // (META["BLOCK_M"] // num_queries_per_kv) + num_seqs, + num_kv_heads, + ) + + kernel_unified_attention_2d[grid]( + output_ptr=out, + query_ptr=q, + key_cache_ptr=k, + value_cache_ptr=v, + block_tables_ptr=block_table, + seq_lens_ptr=seqused_k, + alibi_slopes_ptr=alibi_slopes, + scale=softmax_scale, + k_scale=k_descale, + v_scale=v_descale, + softcap=softcap, + num_query_heads=num_query_heads, + num_queries_per_kv=num_queries_per_kv, + block_table_stride=block_table.stride(0), + query_stride_0=q.stride(0), + query_stride_1=q.stride(1), + output_stride_0=out.stride(0), + output_stride_1=out.stride(1), + BLOCK_SIZE=block_size, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + USE_ALIBI_SLOPES=use_alibi_slopes, + USE_SOFTCAP=(softcap > 0), + SLIDING_WINDOW=(1 + window_size[0]), + stride_k_cache_0=k.stride(0), + stride_k_cache_1=k.stride(1), + stride_k_cache_2=k.stride(2), + stride_k_cache_3=k.stride(3), + stride_v_cache_0=v.stride(0), + stride_v_cache_1=v.stride(1), + stride_v_cache_2=v.stride(2), + stride_v_cache_3=v.stride(3), + query_start_len_ptr=cu_seqlens_q, + num_seqs=num_seqs, + MAX_SEQ_Q=MAX_SEQ_Q, + MAX_SEQ_K=MAX_SEQ_K, + AVG_SEQ_Q=AVG_SEQ_Q, + AVG_SEQ_K=AVG_SEQ_K, + ) + else: + BLOCK_M = 64 if max_seqlen_q > 1 and avg_seqlen_q >= 4096 else 16 + BLOCK_Q = BLOCK_M // num_queries_per_kv + + # Ideally we would launch with kernel with: + # \sum_i[ceil(query_len[i] / BLOCK_Q)] blocks. + # However, it is slow to realize the query_lens on cpu. 
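+        # (realizing query_lens would need a GPU->CPU copy of cu_seqlens_q and a
+        # synchronization on every call; this assumes the usual vLLM setup, where
+        # cu_seqlens_q lives on the device)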
+ # Instead we use upper-bound: + # \sum_i[ceil(query_len[i] / BLOCK_Q)] + # <= \sum_i[floor(query_len[i] / BLOCK_Q) + 1] + # = \sum_i[floor(query_len[i] / BLOCK_Q)] + num_seqs + # <= floor(\sum_i(query_len[i]) / BLOCK_Q) + num_seqs + # = floor(q.shape[0] / BLOCK_Q) + num_seqs + total_num_q_blocks = q.shape[0] // BLOCK_Q + num_seqs + + # for initial version, NUM_SEGMENTS = 16 is chosen as a default + # value that showed good performance in tests + NUM_SEGMENTS = 16 + + segm_output = torch.empty( + q.shape[0], + num_query_heads, + NUM_SEGMENTS, + triton.next_power_of_2(head_size), + dtype=torch.float32, + device=q.device, + ) + segm_max = torch.empty( + q.shape[0], + num_query_heads, + NUM_SEGMENTS, + dtype=torch.float32, + device=q.device, + ) + segm_expsum = torch.empty( + q.shape[0], + num_query_heads, + NUM_SEGMENTS, + dtype=torch.float32, + device=q.device, + ) + + kernel_unified_attention_3d[(total_num_q_blocks, num_kv_heads, NUM_SEGMENTS)]( + segm_output_ptr=segm_output, + segm_max_ptr=segm_max, + segm_expsum_ptr=segm_expsum, + query_ptr=q, + key_cache_ptr=k, + value_cache_ptr=v, + block_tables_ptr=block_table, + seq_lens_ptr=seqused_k, + alibi_slopes_ptr=alibi_slopes, + scale=softmax_scale, + k_scale=k_descale, + v_scale=v_descale, + softcap=softcap, + num_query_heads=num_query_heads, + num_queries_per_kv=num_queries_per_kv, + block_table_stride=block_table.stride(0), + query_stride_0=q.stride(0), + query_stride_1=q.stride(1), + BLOCK_SIZE=block_size, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + USE_ALIBI_SLOPES=use_alibi_slopes, + USE_SOFTCAP=(softcap > 0), + SLIDING_WINDOW=(1 + window_size[0]), + stride_k_cache_0=k.stride(0), + stride_k_cache_1=k.stride(1), + stride_k_cache_2=k.stride(2), + stride_k_cache_3=k.stride(3), + stride_v_cache_0=v.stride(0), + stride_v_cache_1=v.stride(1), + stride_v_cache_2=v.stride(2), + stride_v_cache_3=v.stride(3), + query_start_len_ptr=cu_seqlens_q, + BLOCK_Q=BLOCK_Q, + num_seqs=num_seqs, + BLOCK_M=BLOCK_M, + NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, + ) + + reduce_segments[(q.shape[0], num_query_heads)]( + output_ptr=out, + segm_output_ptr=segm_output, + segm_max_ptr=segm_max, + segm_expsum_ptr=segm_expsum, + seq_lens_ptr=seqused_k, + num_seqs=num_seqs, + num_query_heads=num_query_heads, + output_stride_0=out.stride(0), + output_stride_1=out.stride(1), + block_table_stride=block_table.stride(0), + BLOCK_SIZE=block_size, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + query_start_len_ptr=cu_seqlens_q, + BLOCK_Q=BLOCK_Q, + NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, + ) diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_newtiles.py b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_newtiles.py index 30d80ad34..7c81b4796 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_newtiles.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_newtiles.py @@ -640,7 +640,8 @@ def unified_attention( TILE_SIZE_DECODE = 32 # if batch contains a prefill - if (max_seqlen_q > 1 or total_num_q_blocks * num_kv_heads > 128) or force_selection == 2 and force_selection != 3: + # if (max_seqlen_q > 1 or total_num_q_blocks * num_kv_heads > 128) or force_selection == 2 and force_selection != 3: + if force_selection == 2: kernel_unified_attention_2d[( total_num_q_blocks, num_kv_heads, @@ -683,7 +684,7 @@ def unified_attention( num_seqs=num_seqs, BLOCK_M=BLOCK_M, ) - else: + elif force_selection == 3: # for initial version, NUM_SEGMENTS = 16 is chosen as a default # value that showed good 
performance in tests NUM_SEGMENTS = 16 @@ -769,4 +770,6 @@ def unified_attention( query_start_len_ptr=cu_seqlens_q, BLOCK_Q=BLOCK_Q, NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, - ) \ No newline at end of file + ) + else: + raise RuntimeError("currently, we need to force a kernel selection") \ No newline at end of file diff --git a/scripts/benchmark.py b/scripts/benchmark.py index d34cc0039..3e0de97be 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -1046,10 +1046,10 @@ def test_prefix_vllm_v1_attention( Implementation.UNF_TRITON_3D, Implementation.UNF_TRITON_2D, Implementation.UNF_TRITON_2D_SIMPLE, - Implementation.UNF_TRITON_AUTO, + # Implementation.UNF_TRITON_AUTO, Implementation.NT_UNF_TRITON_3D, Implementation.NT_UNF_TRITON_2D, - Implementation.NT_UNF_TRITON_AUTO, + # Implementation.NT_UNF_TRITON_AUTO, ]: pytest.skip() diff --git a/scripts/callers/unified_triton.py b/scripts/callers/unified_triton.py index 9dd00fe57..3bc3aa5aa 100644 --- a/scripts/callers/unified_triton.py +++ b/scripts/callers/unified_triton.py @@ -76,8 +76,8 @@ def call_and_process_output(): k_descale=None, # TODO? v_descale=None, # TODO? alibi_slopes=None, - avg_seqlen_q=avg_seqlen_q, - avg_seqlen_k=avg_seqlen_k, + # avg_seqlen_q=avg_seqlen_q, + # avg_seqlen_k=avg_seqlen_k, force_selection=force_selection, ) diff --git a/scripts/setups/prefix_tune_2d.conf b/scripts/setups/prefix_tune_2d.conf index 2dd6b8a51..b532bc9af 100644 --- a/scripts/setups/prefix_tune_2d.conf +++ b/scripts/setups/prefix_tune_2d.conf @@ -11,8 +11,8 @@ PREFIX_PREFILL_SHARE_OF_DECODE = [0.0, 0.5, 1.0] PREFIX_PREFILL_SHARE_OF_PARTIAL_PREFILL = [0.0, 0.5] # PREFIX_PREFILL_SHARE_OF_PARTIAL_PREFILL = [0.5] # PREFIX_PREFILL_BATCH_COMPOSITION = ["ALTERNATING"] -PREFIX_PREFILL_BATCH_COMPOSITION = ["DEC_PRE"] -# PREFIX_PREFILL_BATCH_COMPOSITION = ["DEC_PRE", "ALTERNATING"] +# PREFIX_PREFILL_BATCH_COMPOSITION = ["DEC_PRE"] +PREFIX_PREFILL_BATCH_COMPOSITION = ["DEC_PRE", "ALTERNATING"] HEAD_SIZES = [128] # only powers of 2! 
for llama2 & 3 # head_size * head_numbers = hidden_size @@ -24,12 +24,14 @@ PROMPT_PATTERNS = [[1.0], [0.1, 0.4, 0.5, 1.0, 0.2]] # PROMPT_PATTERNS = [[1.0]] MAX_VALUES = [1.0] -BENCHMARK_MODES = ["CUDA_EVENTS"] +# BENCHMARK_MODES = ["CUDA_EVENTS"] +BENCHMARK_MODES = ["CUDA_GRAPHS"] # IMPLEMENTATION_UT = ["UNF_TRITON_2D"] # IMPLEMENTATION_UT = ["UNF_TRITON_2D_SIMPLE"] # IMPLEMENTATION_UT = ["FLASH_ATTN", "UNF_TRITON_2D"] -IMPLEMENTATION_UT = ["NT_UNF_TRITON_2D", "NT_UNF_TRITON_3D"] +IMPLEMENTATION_UT = ["NT_UNF_TRITON_2D", "NT_UNF_TRITON_3D", "FLASH_ATTN", "UNF_TRITON_2D", "UNF_TRITON_3D"] +# IMPLEMENTATION_UT = ["UNF_TRITON_3D"] # TRITON_BACKEND_DEBUG = 1 # STORE_TEST_RESULT_PATH=/results From a2f635e08bd83a2386126e7d3e325015434b28b9 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Wed, 30 Jul 2025 07:42:21 -0400 Subject: [PATCH 28/61] some fixes Signed-off-by: Burkhard Ringlein --- scripts/bench_vllm_user_range.py | 5 ++++- scripts/benchmark.py | 1 + scripts/setups/prefix_tune_2d.conf | 3 ++- vllm | 2 +- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/scripts/bench_vllm_user_range.py b/scripts/bench_vllm_user_range.py index 7674450f9..435b62686 100644 --- a/scripts/bench_vllm_user_range.py +++ b/scripts/bench_vllm_user_range.py @@ -61,7 +61,10 @@ def create_dir_if_not_exist(path, mode=0o777): # result_dir = ( # f"/results/{model.replace('/','-')}/{gpu_name}/{testcase_name}/exp_{timestamp_f}/" # ) -result_dir = f"{result_path}/{model.replace('/','-')}/{gpu_name}/{testcase_name}/exp_{timestamp_f}/" +model_print_path = model.replace('/','-') +if model_print_path[0:2] == './': + model_print_path = model_print_path[2:] +result_dir = f"{result_path}/{model_print_path}/{gpu_name}/{testcase_name}/exp_{timestamp_f}/" bench_script = "/workspace/benchmarks/benchmark_serving.py" if not os.path.isfile(bench_script): diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 3e0de97be..1a7a5c84a 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -1454,6 +1454,7 @@ def test_prefix_vllm_v1_attention( "dtype": dtype, "max_value": max_value, "realistic_prompt_mode": realistic_prompt_mode, + "batch_composition": batch_composition, "gqa_mode": gqa_mode, "prompt_pattern": prompt_pattern, "implementation": implementation, diff --git a/scripts/setups/prefix_tune_2d.conf b/scripts/setups/prefix_tune_2d.conf index b532bc9af..1cd5a119b 100644 --- a/scripts/setups/prefix_tune_2d.conf +++ b/scripts/setups/prefix_tune_2d.conf @@ -30,7 +30,8 @@ BENCHMARK_MODES = ["CUDA_GRAPHS"] # IMPLEMENTATION_UT = ["UNF_TRITON_2D"] # IMPLEMENTATION_UT = ["UNF_TRITON_2D_SIMPLE"] # IMPLEMENTATION_UT = ["FLASH_ATTN", "UNF_TRITON_2D"] -IMPLEMENTATION_UT = ["NT_UNF_TRITON_2D", "NT_UNF_TRITON_3D", "FLASH_ATTN", "UNF_TRITON_2D", "UNF_TRITON_3D"] +# IMPLEMENTATION_UT = ["NT_UNF_TRITON_2D", "NT_UNF_TRITON_3D", "FLASH_ATTN", "UNF_TRITON_2D", "UNF_TRITON_3D"] +IMPLEMENTATION_UT = ["NT_UNF_TRITON_2D", "NT_UNF_TRITON_3D", "UNF_TRITON_2D", "UNF_TRITON_3D"] # IMPLEMENTATION_UT = ["UNF_TRITON_3D"] # TRITON_BACKEND_DEBUG = 1 diff --git a/vllm b/vllm index aef0bfd6e..c3df1cb7b 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit aef0bfd6ec4602b057e9c968347ef100dc533ae3 +Subproject commit c3df1cb7bfc6d7305b7cf34a32ee2311feec883d From e31fc91045ff5bfe1c275fe285bb7a26ed99ca9e Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Wed, 30 Jul 2025 07:51:51 -0400 Subject: [PATCH 29/61] fixing flash_attn alternating Signed-off-by: Burkhard Ringlein --- scripts/benchmark.py | 3 +++ 
scripts/setups/prefix_tune_2d.conf | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 1a7a5c84a..1e8c74365 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -1053,6 +1053,9 @@ def test_prefix_vllm_v1_attention( ]: pytest.skip() + if batch_composition == BatchComposition.ALTERNATING and implementation == Implementation.FLASH_ATTN: + pytest.skip("not supported") + # TODO: Error: "Offset increment outside graph capture" # for triton and flash_attn # if benchmark_mode == BenchmarkMode.CUDA_GRAPHS: diff --git a/scripts/setups/prefix_tune_2d.conf b/scripts/setups/prefix_tune_2d.conf index 1cd5a119b..b532bc9af 100644 --- a/scripts/setups/prefix_tune_2d.conf +++ b/scripts/setups/prefix_tune_2d.conf @@ -30,8 +30,7 @@ BENCHMARK_MODES = ["CUDA_GRAPHS"] # IMPLEMENTATION_UT = ["UNF_TRITON_2D"] # IMPLEMENTATION_UT = ["UNF_TRITON_2D_SIMPLE"] # IMPLEMENTATION_UT = ["FLASH_ATTN", "UNF_TRITON_2D"] -# IMPLEMENTATION_UT = ["NT_UNF_TRITON_2D", "NT_UNF_TRITON_3D", "FLASH_ATTN", "UNF_TRITON_2D", "UNF_TRITON_3D"] -IMPLEMENTATION_UT = ["NT_UNF_TRITON_2D", "NT_UNF_TRITON_3D", "UNF_TRITON_2D", "UNF_TRITON_3D"] +IMPLEMENTATION_UT = ["NT_UNF_TRITON_2D", "NT_UNF_TRITON_3D", "FLASH_ATTN", "UNF_TRITON_2D", "UNF_TRITON_3D"] # IMPLEMENTATION_UT = ["UNF_TRITON_3D"] # TRITON_BACKEND_DEBUG = 1 From 0dda12de2578dc4754ad00f4ef8e05d34436abe0 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Thu, 31 Jul 2025 04:19:16 -0400 Subject: [PATCH 30/61] fix sweep script Signed-off-by: Burkhard Ringlein --- scripts/bench_vllm_user_range.py | 14 +++++++++++--- scripts/setups/prefix_tune_2d.conf | 3 ++- vllm | 2 +- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/scripts/bench_vllm_user_range.py b/scripts/bench_vllm_user_range.py index 435b62686..212930fa7 100644 --- a/scripts/bench_vllm_user_range.py +++ b/scripts/bench_vllm_user_range.py @@ -56,6 +56,8 @@ def create_dir_if_not_exist(path, mode=0o777): max_rounds = 64 max_num_prompts = 1000 +bench_repetitions = 3 + timestamp_f = datetime.now().strftime("%Y-%m-%d_%H%M") # result_dir = ( @@ -92,10 +94,16 @@ def create_dir_if_not_exist(path, mode=0o777): f"--num-prompts {num_prompts} " f"--port 8803" ) - print(cmd) - rv = os.system(cmd) + for i in range(bench_repetitions): + print( + f"====== Measuring max concurrency {max_concurrency} with {num_prompts} prompts; repetition {i} =====" + ) + print(cmd) + rv = os.system(cmd) + if rv != 0: + print(f"benchmark command returned {rv}, stopping...") + break if rv != 0: - print(f"benchmark command returned {rv}, stopping...") break end_time = datetime.now() diff --git a/scripts/setups/prefix_tune_2d.conf b/scripts/setups/prefix_tune_2d.conf index b532bc9af..1cd5a119b 100644 --- a/scripts/setups/prefix_tune_2d.conf +++ b/scripts/setups/prefix_tune_2d.conf @@ -30,7 +30,8 @@ BENCHMARK_MODES = ["CUDA_GRAPHS"] # IMPLEMENTATION_UT = ["UNF_TRITON_2D"] # IMPLEMENTATION_UT = ["UNF_TRITON_2D_SIMPLE"] # IMPLEMENTATION_UT = ["FLASH_ATTN", "UNF_TRITON_2D"] -IMPLEMENTATION_UT = ["NT_UNF_TRITON_2D", "NT_UNF_TRITON_3D", "FLASH_ATTN", "UNF_TRITON_2D", "UNF_TRITON_3D"] +# IMPLEMENTATION_UT = ["NT_UNF_TRITON_2D", "NT_UNF_TRITON_3D", "FLASH_ATTN", "UNF_TRITON_2D", "UNF_TRITON_3D"] +IMPLEMENTATION_UT = ["NT_UNF_TRITON_2D", "NT_UNF_TRITON_3D", "UNF_TRITON_2D", "UNF_TRITON_3D"] # IMPLEMENTATION_UT = ["UNF_TRITON_3D"] # TRITON_BACKEND_DEBUG = 1 diff --git a/vllm b/vllm index c3df1cb7b..c4f7d6775 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit 
c3df1cb7bfc6d7305b7cf34a32ee2311feec883d +Subproject commit c4f7d677590e140c2a22651b4cae73e3d4727b4f From dc8af9d3d8603f2d10300a47b93b56967289ca60 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Thu, 31 Jul 2025 09:08:32 -0400 Subject: [PATCH 31/61] adding random range serve bench script Signed-off-by: Burkhard Ringlein --- scripts/bench_vllm_user_range_random.py | 114 ++++++++++++++++++++++++ vllm | 2 +- 2 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 scripts/bench_vllm_user_range_random.py diff --git a/scripts/bench_vllm_user_range_random.py b/scripts/bench_vllm_user_range_random.py new file mode 100644 index 000000000..c11d1ccaf --- /dev/null +++ b/scripts/bench_vllm_user_range_random.py @@ -0,0 +1,114 @@ +# /******************************************************************************* +# * Copyright 2025 IBM Corporation +# * +# * Licensed under the Apache License, Version 2.0 (the "License"); +# * you may not use this file except in compliance with the License. +# * You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. +# *******************************************************************************/ +# + +import os +import sys +import torch +from datetime import datetime + + +def create_dir_if_not_exist_recursive(path, mode=0o777): + norm_path = os.path.normpath(path) + paths_l = norm_path.split(os.sep) + path_walked = f"{os.sep}" + for p in paths_l: + if len(p) == 0: + continue + path_walked = os.path.join(path_walked, p) + create_dir_if_not_exist(path_walked, mode) + + +def create_dir_if_not_exist(path, mode=0o777): + if not os.path.exists(path): + os.mkdir(path) + try: + os.chmod(path, mode) + except PermissionError as e: + print(f"can't set permission of directory {path}: {e}") + +if len(sys.argv) < 6: + print(f"Usage: {sys.argv[0]} ") + exit(-1) + +num_users_to_test = [1, 2, 4, 8, 16, 32, 64, 128] +gpu_name = torch.cuda.get_device_name().replace(" ", "_").replace("/", "_") + +# model = "/model/llama3.1-8b/instruct/" +model = sys.argv[1] +input_len = sys.argv[2] +output_len = sys.argv[3] +testcase_name = sys.argv[4] +result_path = os.path.abspath(sys.argv[5]) + +# max_rounds = 128 +max_rounds = 64 +max_num_prompts = 1000 + +bench_repetitions = 3 + +timestamp_f = datetime.now().strftime("%Y-%m-%d_%H%M") + +# result_dir = ( +# f"/results/{model.replace('/','-')}/{gpu_name}/{testcase_name}/exp_{timestamp_f}/" +# ) +model_print_path = model.replace('/','-') +if model_print_path[0:2] == './': + model_print_path = model_print_path[2:] +result_dir = f"{result_path}/{model_print_path}/{gpu_name}/{testcase_name}/exp_{timestamp_f}/" + +bench_script = "/workspace/benchmarks/benchmark_serving.py" +if not os.path.isfile(bench_script): + bench_script = "./vllm-triton-backend/vllm/benchmarks/benchmark_serving.py" + if not os.path.isfile(bench_script): + print(f"can't find benchmark script benchmark_serving.py") + exit(-1) + +# os.system(f"mkdir -p {result_dir}") +create_dir_if_not_exist_recursive(result_dir) + +start_time = datetime.now() +for max_concurrency in num_users_to_test: + num_prompts = ( + max_num_prompts + if max_num_prompts // max_concurrency < max_rounds + else int(max_rounds * 
max_concurrency) + ) + cmd = ( + f"VLLM_USE_V1=1 python {bench_script} " + f"--model {model} " + f"--dataset-name random --random-input-len={input_len} --random-output-len={output_len} --ignore-eos " + f"--save-result --result-dir {result_dir} --max-concurrency {max_concurrency} " + f"--percentile-metrics ttft,tpot,itl,e2el --metric-percentiles 20,50,80,99 " + f"--num-prompts {num_prompts} " + f"--port 8803" + ) + for i in range(bench_repetitions): + print( + f"====== Measuring max concurrency {max_concurrency} with {num_prompts} prompts; repetition {i} =====" + ) + print(cmd) + rv = os.system(cmd) + if rv != 0: + print(f"benchmark command returned {rv}, stopping...") + break + if rv != 0: + break + +end_time = datetime.now() +print(f"results stored in: {result_dir}") +os.system(f"ls -alh {result_dir}") +print(f"Benchmark time: {end_time-start_time}") diff --git a/vllm b/vllm index c4f7d6775..50140e9b7 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit c4f7d677590e140c2a22651b4cae73e3d4727b4f +Subproject commit 50140e9b7897976aaf0e4acef8a061bcbada1ac3 From 08a967e002278dca2404355940b89f52725d7f83 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Mon, 4 Aug 2025 08:53:25 -0400 Subject: [PATCH 32/61] making benchmark random range faster Signed-off-by: Burkhard Ringlein --- scripts/bench_vllm_user_range_random.py | 20 ++++++++++++-------- vllm | 2 +- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/scripts/bench_vllm_user_range_random.py b/scripts/bench_vllm_user_range_random.py index c11d1ccaf..5f16c45b6 100644 --- a/scripts/bench_vllm_user_range_random.py +++ b/scripts/bench_vllm_user_range_random.py @@ -44,7 +44,8 @@ def create_dir_if_not_exist(path, mode=0o777): print(f"Usage: {sys.argv[0]} ") exit(-1) -num_users_to_test = [1, 2, 4, 8, 16, 32, 64, 128] +# num_users_to_test = [1, 2, 4, 8, 16, 32, 64, 128] +num_users_to_test = [1, 2, 4, 8, 16, 32, 64] gpu_name = torch.cuda.get_device_name().replace(" ", "_").replace("/", "_") # model = "/model/llama3.1-8b/instruct/" @@ -55,8 +56,10 @@ def create_dir_if_not_exist(path, mode=0o777): result_path = os.path.abspath(sys.argv[5]) # max_rounds = 128 -max_rounds = 64 -max_num_prompts = 1000 +# max_rounds = 64 +# max_rounds = 16 +# max_num_prompts = 1000 +min_num_prompts = 16 bench_repetitions = 3 @@ -82,11 +85,12 @@ def create_dir_if_not_exist(path, mode=0o777): start_time = datetime.now() for max_concurrency in num_users_to_test: - num_prompts = ( - max_num_prompts - if max_num_prompts // max_concurrency < max_rounds - else int(max_rounds * max_concurrency) - ) + # num_prompts = ( + # max_num_prompts + # if max_num_prompts // max_concurrency < max_rounds + # else int(max_rounds * max_concurrency) + # ) + num_prompts = int(max(min_num_prompts, 2*max_concurrency)) cmd = ( f"VLLM_USE_V1=1 python {bench_script} " f"--model {model} " diff --git a/vllm b/vllm index 50140e9b7..cc898391b 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit 50140e9b7897976aaf0e4acef8a061bcbada1ac3 +Subproject commit cc898391bf40486fcb31252cc4e9b953fa80ffbe From ded193b890c79fbe0d0e582ad618d77d00697c0b Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Tue, 5 Aug 2025 09:34:28 -0400 Subject: [PATCH 33/61] quantize g4 script Signed-off-by: Burkhard Ringlein --- scripts/quantize_g4.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 scripts/quantize_g4.py diff --git a/scripts/quantize_g4.py b/scripts/quantize_g4.py new file mode 100644 index 000000000..e82bed56e --- /dev/null +++ 
b/scripts/quantize_g4.py @@ -0,0 +1,39 @@ +import sys +import os + +from transformers import AutoTokenizer, AutoModelForCausalLM +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier + +# MODEL_ID = "ibm-granite/granite-4.0-tiny-preview" +model_path = sys.argv[1] +store_path = sys.argv[2] +print(f"Quantizing {model_path} using FP8_DYNAMIC...") + +model = AutoModelForCausalLM.from_pretrained( + model_path, device_map="auto", torch_dtype="auto", +) +tokenizer = AutoTokenizer.from_pretrained(model_path) + +# Configure the simple PTQ quantization +recipe = QuantizationModifier( + targets="Linear", scheme="FP8_DYNAMIC", + ignore=[ + "re:.*lm_head", + "re:.*self_attn", + "re:.*router", + # "re:.*block_sparse_moe.gate", + "re:.*moe*", + ] + ) + +# Apply the quantization algorithm. +oneshot(model=model, recipe=recipe) + +print(f"...done. Saving to {store_path}...") +# Save the model: granite-4.0-tiny-preview-FP8-Dynamic +# SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(store_path) +tokenizer.save_pretrained(store_path) + +print("...done.") From e5f50d587e0b4850ab3a76d1fcb3c08aabf5f366 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Fri, 8 Aug 2025 04:51:06 -0400 Subject: [PATCH 34/61] getting fp8 dynamic to work Signed-off-by: Burkhard Ringlein --- scripts/quantize_g4.py | 18 ++++--- scripts/quantize_g4_2.py | 108 +++++++++++++++++++++++++++++++++++++++ vllm | 2 +- 3 files changed, 121 insertions(+), 7 deletions(-) create mode 100644 scripts/quantize_g4_2.py diff --git a/scripts/quantize_g4.py b/scripts/quantize_g4.py index e82bed56e..9c3a213de 100644 --- a/scripts/quantize_g4.py +++ b/scripts/quantize_g4.py @@ -2,6 +2,7 @@ import os from transformers import AutoTokenizer, AutoModelForCausalLM +from transformers import GraniteMoeHybridForCausalLM from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier @@ -13,6 +14,10 @@ model = AutoModelForCausalLM.from_pretrained( model_path, device_map="auto", torch_dtype="auto", ) +# model = GraniteMoeHybridForCausalLM.from_pretrained( +# model_path, device_map="auto", torch_dtype="auto", +# ) +# print(model) tokenizer = AutoTokenizer.from_pretrained(model_path) # Configure the simple PTQ quantization @@ -20,20 +25,21 @@ targets="Linear", scheme="FP8_DYNAMIC", ignore=[ "re:.*lm_head", - "re:.*self_attn", - "re:.*router", + # "re:.*self_attn", + # "re:.*router", # "re:.*block_sparse_moe.gate", - "re:.*moe*", + # "re:.*moe*", + "re:.*block_sparse_moe", ] ) # Apply the quantization algorithm. oneshot(model=model, recipe=recipe) +#, output_dir=store_path) print(f"...done. 
Saving to {store_path}...") -# Save the model: granite-4.0-tiny-preview-FP8-Dynamic -# SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -model.save_pretrained(store_path) +# # SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(store_path, save_compressed=True) tokenizer.save_pretrained(store_path) print("...done.") diff --git a/scripts/quantize_g4_2.py b/scripts/quantize_g4_2.py new file mode 100644 index 000000000..2afb038a9 --- /dev/null +++ b/scripts/quantize_g4_2.py @@ -0,0 +1,108 @@ +import sys +import os + +from transformers import AutoTokenizer, AutoModelForCausalLM +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier + +# MODEL_ID = "ibm-granite/granite-4.0-tiny-preview" +model_path = sys.argv[1] +store_path = sys.argv[2] +print(f"Quantizing {model_path} using FP8...") + + +from datasets import load_dataset +from packaging.version import Version +from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ + +from llmcompressor.utils import dispatch_for_generation + +# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. +# Please consider either downgrading your transformers version to a +# previous version or upgrading to a version where this bug is fixed + +# select a Mixture of Experts model for quantization +MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" + +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, torch_dtype="auto", trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset. +# its recommended to use more calibration samples for MoE models so each expert is hit +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" +NUM_CALIBRATION_SAMPLES = 2048 +MAX_SEQUENCE_LENGTH = 2048 + + +# Load dataset and preprocess. +ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") +ds = ds.shuffle(seed=42) + + +def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["messages"], + tokenize=False, + ) + } + + +ds = ds.map(preprocess) + + +# Tokenize inputs. +def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + + +ds = ds.map(tokenize, remove_columns=ds.column_names) + + + +model = AutoModelForCausalLM.from_pretrained( + model_path, device_map="auto", torch_dtype="auto", +) +tokenizer = AutoTokenizer.from_pretrained(model_path) + +# Configure the simple PTQ quantization +recipe = QuantizationModifier( + targets="Linear", scheme="FP8", + ignore=[ + "re:.*lm_head", + "re:.*self_attn", + "re:.*router", + # "re:.*block_sparse_moe.gate", + "re:.*moe*", + ] + ) + +# Apply the quantization algorithm. +# oneshot(model=model, recipe=recipe) + +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + trust_remote_code_model=True, +) + + +print(f"...done. 
Saving to {store_path}...") +# Save the model: granite-4.0-tiny-preview-FP8-Dynamic +# SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(store_path) +tokenizer.save_pretrained(store_path) + +print("...done.") diff --git a/vllm b/vllm index cc898391b..f701d8648 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit cc898391bf40486fcb31252cc4e9b953fa80ffbe +Subproject commit f701d8648bcc21ab65bb2b5637b6baf099278576 From 9f5e543a7de6051784c25a544c6f840d9561d845 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Fri, 8 Aug 2025 08:29:33 -0400 Subject: [PATCH 35/61] micro benchmark for vllm full cuda graph mode Signed-off-by: Burkhard Ringlein --- scripts/benchmark.py | 86 +++++++++++-------- .../setups/prefix_optimize_launchgrid.conf | 42 +++++++++ 2 files changed, 92 insertions(+), 36 deletions(-) create mode 100644 scripts/setups/prefix_optimize_launchgrid.conf diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 1e8c74365..4d84ced28 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -104,6 +104,7 @@ class BatchComposition(Enum): PREFIX_PREFILL_SHARE_OF_DECODE = [0.0, 0.5, 1.0] PREFIX_PREFILL_SHARE_OF_PARTIAL_PREFILL = [0.0, 0.5] PREFIX_PREFILL_BATCH_COMPOSITION = [BatchComposition.ALTERNATING] +RESERVE_INPUT_TOKEN_LENGTH = [None] HEAD_SIZES = [128] # only powers of 2! for llama2 & 3 # head_size * head_numbers = hidden_size @@ -186,6 +187,7 @@ class BatchComposition(Enum): "MOE_K", "TP_FACTOR", "MOE_TOP_K", + "RESERVE_INPUT_TOKEN_LENGTH", ] # "BENCHMARK_MODES", "IMPLEMENTATION_UT" ] debug_env_vars = [ @@ -1005,6 +1007,7 @@ def test_prefill_vllm_v0_attention( @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("implementation", IMPLEMENTATION_UT) @pytest.mark.parametrize("max_value", MAX_VALUES) +@pytest.mark.parametrize("reserved_query_length", RESERVE_INPUT_TOKEN_LENGTH) @pytest.mark.parametrize("benchmark_mode", BENCHMARK_MODES) @torch.inference_mode() def test_prefix_vllm_v1_attention( @@ -1026,6 +1029,7 @@ def test_prefix_vllm_v1_attention( seed, implementation, max_value, + reserved_query_length, benchmark_mode, ): my_id = request.node.nodeid.split("::")[-1] @@ -1034,6 +1038,9 @@ def test_prefix_vllm_v1_attention( realistic_prompt_mode = len(prompt_pattern) > 1 gqa_mode = num_heads[0] != num_heads[1] + reserved_query_length = None if reserved_query_length in [None, 'none', -1, 0] else int(reserved_query_length) + skip_ref_impl = True if reserved_query_length is not None else False + if torch.cuda.get_device_capability()[0] < 8: # reduce operations are not supported (?) 
pytest.skip() @@ -1195,7 +1202,10 @@ def test_prefix_vllm_v1_attention( inner_exception = None try: - query = torch.empty(total_query_tokens, num_query_heads, head_size, dtype=dtype) + query_tensor_num_tokens_reserved = total_query_tokens + if reserved_query_length is not None: + query_tensor_num_tokens_reserved = reserved_query_length + query = torch.empty(query_tensor_num_tokens_reserved, num_query_heads, head_size, dtype=dtype) query.uniform_(-max_value, max_value) key = torch.empty(total_token_num, num_kv_heads, head_size, dtype=dtype) @@ -1254,41 +1264,42 @@ def test_prefix_vllm_v1_attention( slot_mapping_lst.extend(slot_mapping_i) slot_mapping_t = torch.tensor(slot_mapping_lst, dtype=torch.int) - ref_reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - slot_mapping_t, - block_size, - total_token_num, - ) + if not skip_ref_impl: + ref_reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + slot_mapping_t, + block_size, + total_token_num, + ) - ref_output = ref_prefix_prefill( - query, - num_queries_per_kv, - key_cache, - value_cache, - key, - value, - block_table_t, - b_seq_lens, - b_ctx_lens, - b_query_lens, - b_start_loc, - batch_size, - scale, - dtype, - ) - # ref_output = ref_paged_attn( - # query, - # key_cache, - # value_cache, - # b_query_lens, - # b_ctx_lens, - # block_table_t, - # scale, - # ) + ref_output = ref_prefix_prefill( + query, + num_queries_per_kv, + key_cache, + value_cache, + key, + value, + block_table_t, + b_seq_lens, + b_ctx_lens, + b_query_lens, + b_start_loc, + batch_size, + scale, + dtype, + ) + # ref_output = ref_paged_attn( + # query, + # key_cache, + # value_cache, + # b_query_lens, + # b_ctx_lens, + # block_table_t, + # scale, + # ) if implementation == Implementation.FLASH_ATTN: from callers import FlashAttnPrefixPrefillCaller as Caller @@ -1377,10 +1388,12 @@ def test_prefix_vllm_v1_attention( # captured += l # + '|' captured += l + " " # compare - if enforce_numerical_correctness: + if enforce_numerical_correctness and not skip_ref_impl: # for better reports triton.testing.assert_close(ref_output, output, atol=ATOL, rtol=RTOL) allclose_pass = True + elif skip_ref_impl: + allclose_pass = 'skipped' else: allclose_pass = torch.allclose(ref_output, output, atol=ATOL, rtol=RTOL) @@ -1456,6 +1469,7 @@ def test_prefix_vllm_v1_attention( "num_blocks": num_blocks, "dtype": dtype, "max_value": max_value, + "query_tensor_num_tokens_reserved": query_tensor_num_tokens_reserved, "realistic_prompt_mode": realistic_prompt_mode, "batch_composition": batch_composition, "gqa_mode": gqa_mode, diff --git a/scripts/setups/prefix_optimize_launchgrid.conf b/scripts/setups/prefix_optimize_launchgrid.conf new file mode 100644 index 000000000..75fc0af56 --- /dev/null +++ b/scripts/setups/prefix_optimize_launchgrid.conf @@ -0,0 +1,42 @@ +BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64, 128] +# BATCH_SIZES = [4] +# order: num_query_heads, num_kv_heads +NUM_HEADS = [[32, 8]] + +SEQUENCE_LENGTHS = [16, 32, 64, 128, 512, 1024, 2048, 4096] +# SEQUENCE_LENGTHS = [64] +PREFIX_PREFILL_SHARE_OF_DECODE = [0.0, 0.5, 1.0] +# PREFIX_PREFILL_SHARE_OF_DECODE = [0.0, 0.5] +# PREFIX_PREFILL_SHARE_OF_DECODE = [0.5] +PREFIX_PREFILL_SHARE_OF_PARTIAL_PREFILL = [0.0, 0.5] +# PREFIX_PREFILL_SHARE_OF_PARTIAL_PREFILL = [0.5] +# PREFIX_PREFILL_BATCH_COMPOSITION = ["ALTERNATING"] +PREFIX_PREFILL_BATCH_COMPOSITION = ["DEC_PRE"] +# PREFIX_PREFILL_BATCH_COMPOSITION = ["DEC_PRE", "ALTERNATING"] + +# max model length granite4, 'none' means not to reserve more than in the batch +# 
RESERVE_INPUT_TOKEN_LENGTH = ["none", 132096] +RESERVE_INPUT_TOKEN_LENGTH = [132096] + +HEAD_SIZES = [128] # only powers of 2! for llama2 & 3 +# head_size * head_numbers = hidden_size + +BLOCK_SIZES = [16] +NUM_BLOCKS = [4321] # "arbitrary values for testing..." + +PROMPT_PATTERNS = [[1.0], [0.1, 0.4, 0.5, 1.0, 0.2]] +# PROMPT_PATTERNS = [[1.0]] + +MAX_VALUES = [1.0] +# BENCHMARK_MODES = ["CUDA_EVENTS"] +BENCHMARK_MODES = ["CUDA_GRAPHS"] + +# IMPLEMENTATION_UT = ["NT_UNF_TRITON_2D", "NT_UNF_TRITON_3D", "UNF_TRITON_2D", "UNF_TRITON_3D"] +IMPLEMENTATION_UT = ["NT_UNF_TRITON_2D", "NT_UNF_TRITON_3D"] +# IMPLEMENTATION_UT = ["UNF_TRITON_3D"] + +# TRITON_BACKEND_DEBUG = 1 +# STORE_TEST_RESULT_PATH=/results +STORE_TEST_RESULT_PATH=./zrl-triton-results-and-notebooks/micro_benchmarks/raw_data/ + +# TEST_ALLOW_INCORRECT = 1 From af5befc16ba09c43c84f3c3cf3e02d19e23627e8 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Tue, 12 Aug 2025 10:05:15 -0400 Subject: [PATCH 36/61] starting ws experiments Signed-off-by: Burkhard Ringlein --- .../ibm_triton_lib/kernels/__init__.py | 1 + .../default/cache.json | 8 +++ .../kernels/triton_unified_attention_tuned.py | 7 ++ scripts/benchmark.py | 4 ++ scripts/callers/__init__.py | 1 + scripts/callers/unified_triton.py | 70 ++++++++++++++++++- .../setups/prefix_optimize_launchgrid.conf | 7 +- scripts/setups/tune_2d_ws.conf | 30 ++++++++ 8 files changed, 124 insertions(+), 4 deletions(-) create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json create mode 100644 scripts/setups/tune_2d_ws.conf diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py b/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py index 1471acd33..3574f8e2b 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py @@ -69,6 +69,7 @@ def ConfigSpace( from .triton_unified_attention import unified_attention from .triton_unified_attention_simple import unified_attention as unified_attention_simple from .triton_unified_newtiles import unified_attention as unified_attention_newtiles +from .triton_unified_attention_tuned import unified_attention as unified_attention_tuned from .mamba_ssm import selective_state_update diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json new file mode 100755 index 000000000..a4569e066 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention_tuned:kernel_unified_attention_2d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_tuned.py b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_tuned.py index 7b1a17d23..13294b887 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_tuned.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_tuned.py @@ -237,6 +237,11 @@ def prefill_heuristics_2d(MAX_SEQ_Q, MAX_SEQ_K, AVG_SEQ_Q, AVG_SEQ_K): }, num_warps=[2, 4, 8], num_stages=[1, 2, 4, 6, 8], + num_consumer_groups=[0, 2, 4], + num_buffers_warp_spec=[0, 3, 6], + conditions=[ + lambda c: c.num_consumer_groups !=0 and c.num_buffers_warp_spec != 0, + ] ), # this list is longer, since it would be used for multiple models key=[ @@ -861,6 +866,8 @@ def unified_attention( assert causal, "Only causal attention is supported" assert q_descale is None, "Q scales not supported" + assert force_selection == 2 # only 2d is tuned for now + block_size = v.shape[1] assert ( q.element_size() >= 2 or block_size >= 32 diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 4d84ced28..0f53a537b 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -74,6 +74,7 @@ class Implementation(Enum): NT_UNF_TRITON_3D = 17 NT_UNF_TRITON_2D = 18 NT_UNF_TRITON_AUTO = 19 + UNF_TRITON_2D_TUNED = 20 class BenchmarkMode(Enum): @@ -1057,6 +1058,7 @@ def test_prefix_vllm_v1_attention( Implementation.NT_UNF_TRITON_3D, Implementation.NT_UNF_TRITON_2D, # Implementation.NT_UNF_TRITON_AUTO, + Implementation.UNF_TRITON_2D_TUNED, ]: pytest.skip() @@ -1325,6 +1327,8 @@ def test_prefix_vllm_v1_attention( from callers import NewTilesUnifiedTriton2dAttentionCaller as Caller elif implementation == Implementation.NT_UNF_TRITON_AUTO: from callers import NewTilesUnifiedTritonAutoAttentionCaller as Caller + elif implementation == Implementation.UNF_TRITON_2D_TUNED: + from callers import TunedUnifiedTriton2dAttentionCaller as Caller if Caller.requires_allocated_output: output = torch.empty_like(query) diff --git a/scripts/callers/__init__.py b/scripts/callers/__init__.py index ad51a88e2..41131a013 100644 --- a/scripts/callers/__init__.py +++ b/scripts/callers/__init__.py @@ -57,6 +57,7 @@ UnifiedTriton3dAttentionCaller, UnifiedTritonAutoAttentionCaller, 
SimpleUnifiedTriton2dAttentionCaller,
+    TunedUnifiedTriton2dAttentionCaller,
 )
 from .unified_triton_newtiles import (
     NewTilesUnifiedTriton2dAttentionCaller,
diff --git a/scripts/callers/unified_triton.py b/scripts/callers/unified_triton.py
index 3bc3aa5aa..10457d8da 100644
--- a/scripts/callers/unified_triton.py
+++ b/scripts/callers/unified_triton.py
@@ -17,7 +17,7 @@

 import torch

-from ibm_triton_lib.kernels import unified_attention, unified_attention_simple
+from ibm_triton_lib.kernels import unified_attention, unified_attention_simple, unified_attention_tuned

 from .base import PrefixPrefillCaller

@@ -229,3 +229,71 @@ def make_call_func(
         softmax_scale,
         force_selection=None,
     )  # none triggers vllm default behaviour
+
+
+class TunedUnifiedTriton2dAttentionCaller(PrefixPrefillCaller):
+    @staticmethod
+    def make_call_func(
+        output,
+        query,
+        key_cache,
+        value_cache,
+        key,
+        value,
+        block_tables,
+        seq_lens,
+        ctx_lens,
+        query_lens,
+        start_loc,
+        seq_start_loc,
+        softmax_scale,
+        # kv_cache_dtype,  # unused
+        force_selection=2,
+    ):
+        """
+        query: shape = [num_tokens, num_heads, head_size]
+        key: shape = [num_tokens, num_kv_heads, head_size]
+        value: shape = [num_tokens, num_kv_heads, head_size]
+        k_cache = [num_blocks, block_size, num_kv_heads, head_size]
+        v_cache = [num_blocks, block_size, num_kv_heads, head_size]
+        Returns:
+            shape = [num_tokens, num_heads, head_size]
+        """
+        assert force_selection == 2, "tuned unified kernel is only applicable to 2d"
+
+        max_query_len = query_lens.max()
+        max_seqlen = seq_lens.max()
+
+        avg_seqlen_q = query_lens.to(torch.float).mean()
+        avg_seqlen_k = seq_lens.to(torch.float).mean()
+
+        def call_and_process_output():
+            # k must have shape (num_blocks, page_block_size, num_heads_k, head_size)
+            return unified_attention_tuned(
+                q=query,
+                k=key_cache,
+                v=value_cache,
+                out=output,
+                cu_seqlens_q=start_loc,
+                max_seqlen_q=max_query_len,
+                seqused_k=seq_lens,
+                max_seqlen_k=max_seqlen,
+                softmax_scale=softmax_scale,
+                causal=True,
+                window_size=(-1, -1),
+                block_table=block_tables,
+                softcap=0,
+                q_descale=None,
+                k_descale=None,  # TODO?
+                v_descale=None,  # TODO?
+                alibi_slopes=None,
+                avg_seqlen_q=avg_seqlen_q,
+                avg_seqlen_k=avg_seqlen_k,
+                force_selection=2,
+            )
+
+        return call_and_process_output
+
+    @staticmethod
+    def requires_allocated_output() -> bool:
+        return True
diff --git a/scripts/setups/prefix_optimize_launchgrid.conf b/scripts/setups/prefix_optimize_launchgrid.conf
index 75fc0af56..7849b11e7 100644
--- a/scripts/setups/prefix_optimize_launchgrid.conf
+++ b/scripts/setups/prefix_optimize_launchgrid.conf
@@ -16,7 +16,8 @@ PREFIX_PREFILL_BATCH_COMPOSITION = ["DEC_PRE"]

 # max model length granite4, 'none' means not to reserve more than in the batch
 # RESERVE_INPUT_TOKEN_LENGTH = ["none", 132096]
-RESERVE_INPUT_TOKEN_LENGTH = [132096]
+# RESERVE_INPUT_TOKEN_LENGTH = [132096]
+RESERVE_INPUT_TOKEN_LENGTH = ["none"]

 HEAD_SIZES = [128] # only powers of 2!
for llama2 & 3 # head_size * head_numbers = hidden_size @@ -28,8 +29,8 @@ PROMPT_PATTERNS = [[1.0], [0.1, 0.4, 0.5, 1.0, 0.2]] # PROMPT_PATTERNS = [[1.0]] MAX_VALUES = [1.0] -# BENCHMARK_MODES = ["CUDA_EVENTS"] -BENCHMARK_MODES = ["CUDA_GRAPHS"] +BENCHMARK_MODES = ["CUDA_EVENTS"] +# BENCHMARK_MODES = ["CUDA_GRAPHS"] # IMPLEMENTATION_UT = ["NT_UNF_TRITON_2D", "NT_UNF_TRITON_3D", "UNF_TRITON_2D", "UNF_TRITON_3D"] IMPLEMENTATION_UT = ["NT_UNF_TRITON_2D", "NT_UNF_TRITON_3D"] diff --git a/scripts/setups/tune_2d_ws.conf b/scripts/setups/tune_2d_ws.conf new file mode 100644 index 000000000..517b9a441 --- /dev/null +++ b/scripts/setups/tune_2d_ws.conf @@ -0,0 +1,30 @@ +BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64, 128] +# BATCH_SIZES = [4] +# order: num_query_heads, num_kv_heads +NUM_HEADS = [[32, 8]] + +SEQUENCE_LENGTHS = [16, 32, 64, 128, 512, 1024, 2048, 4096] +# SEQUENCE_LENGTHS = [64] +PREFIX_PREFILL_SHARE_OF_DECODE = [0.0, 0.5] +# PREFIX_PREFILL_SHARE_OF_PARTIAL_PREFILL = [0.0, 0.5] +PREFIX_PREFILL_SHARE_OF_PARTIAL_PREFILL = [0.0] +# PREFIX_PREFILL_BATCH_COMPOSITION = ["DEC_PRE"] + +HEAD_SIZES = [128] # only powers of 2! for llama2 & 3 +BLOCK_SIZES = [16] +NUM_BLOCKS = [4321] # "arbitrary values for testing..." + +PROMPT_PATTERNS = [[1.0], [0.1, 0.4, 0.5, 1.0, 0.2]] +# PROMPT_PATTERNS = [[1.0]] + +MAX_VALUES = [1.0] +# BENCHMARK_MODES = ["CUDA_EVENTS"] +BENCHMARK_MODES = ["CUDA_GRAPHS"] + +IMPLEMENTATION_UT = ["UNF_TRITON_2D_TUNED"] + +# TRITON_BACKEND_DEBUG = 1 +# STORE_TEST_RESULT_PATH=/results +STORE_TEST_RESULT_PATH=./zrl-triton-results-and-notebooks/micro_benchmarks/raw_data/ + +# TEST_ALLOW_INCORRECT = 1 From b2c94f1b12857c7e71ae786903e1e46d73b11e16 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Tue, 12 Aug 2025 10:05:38 -0400 Subject: [PATCH 37/61] another schema for fp8 tuning Signed-off-by: Burkhard Ringlein --- scripts/quantize_g4.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/scripts/quantize_g4.py b/scripts/quantize_g4.py index 9c3a213de..7d19f6e20 100644 --- a/scripts/quantize_g4.py +++ b/scripts/quantize_g4.py @@ -25,11 +25,8 @@ targets="Linear", scheme="FP8_DYNAMIC", ignore=[ "re:.*lm_head", - # "re:.*self_attn", - # "re:.*router", - # "re:.*block_sparse_moe.gate", - # "re:.*moe*", - "re:.*block_sparse_moe", + # "re:.*block_sparse_moe", + "re:.*block_sparse_moe.router", ] ) From b510a5e87f40880924428d8d22217e49ac71c284 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Wed, 13 Aug 2025 04:29:43 -0400 Subject: [PATCH 38/61] first ws mb Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 265 +++++++++++++++++- .../default/cache.json | 35 +++ .../kernels/triton_unified_attention_tuned.py | 12 +- scripts/setups/tune_2d_ws.conf | 4 +- triton-dejavu | 2 +- vllm | 2 +- 6 files changed, 308 insertions(+), 12 deletions(-) create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-96fc3b4e585fc8cfcb4fcdd974640839b5a5889cf4f54dbf57ad6a3439b671d0/default/cache.json diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json index a4569e066..e997541c2 100755 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json @@ -1,8 +1,263 @@ { "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention_tuned:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} + "total_bench_time_s": 21377.796743392944, + "evaluated_configs": 2160, + "keys": [ + "MAX_SEQ_Q", + "MAX_SEQ_K", + "AVG_SEQ_Q", + "AVG_SEQ_K", + "num_query_heads", + "num_queries_per_kv", + "BLOCK_SIZE", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "SLIDING_WINDOW", + "stride_k_cache_3", + "stride_v_cache_3" + ], + "cache": { + "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('512', '512', '512', '512', 
'32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', 
'1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + 
"('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", 
+ "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.006423796992748976 + ], + "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.006897487211972475 + ], + "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.007865289226174355 + ], + "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.012806367129087448 + ], + "('512', '512', 
'512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.11409414559602737 + ], + "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.36400967836380005 + ], + "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 1.291664481163025 + ], + "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 4.830662727355957 + ], + "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0067154536955058575 + ], + "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.007009030785411596 + ], + "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.006567405071109533 + ], + "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.006921715103089809 + ], + "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.007554848212748766 + ], + "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.007870307192206383 + ], + "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.012347826734185219 + ], + "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.018965136259794235 + ], + "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.03259870782494545 + ], + "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.11627256125211716 + ], + "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0549253448843956 + ], + "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.37127885222435 + ], + "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.09950052946805954 + ], + "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 1.3021571636199951 + ], + "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.1874120533466339 + ], + "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 4.851548671722412 + ], + "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.006778071168810129 + ], + "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.006958519574254751 + ], + "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.006996186450123787 + ], + "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.006850973702967167 + ], + "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.00791214406490326 + ], + "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.006878295913338661 + ], + "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.013943970203399658 + ], + "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.02429494820535183 + ], + "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.03789611533284187 + ], + "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.15952551364898682 + ], + "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.5120749473571777 + ], + "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.03336550295352936 + ], + "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 1.803341269493103 + ], + "('4096', '4096', '2048', '2048', 
'32', '4', '16', '128', '128', '0', '1', '1')": [ + 6.802962303161621 + ], + "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0067731114104390144 + ], + "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.007123402785509825 + ], + "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.013310004025697708 + ], + "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.006687874905765057 + ], + "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.00769382668659091 + ], + "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.014694097451865673 + ], + "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.006742445286363363 + ], + "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.00831019226461649 + ], + "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.02136719599366188 + ], + "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0192007627338171 + ], + "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.04041781276464462 + ], + "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.09291289746761322 + ], + "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.28874820470809937 + ], + "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.2564668357372284 + ], + "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.918175995349884 + ], + "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.02123316191136837 + ], + "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.7775593996047974 + ], + "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 3.24080228805542 + ], + "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 2.575653076171875 + ], + "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 12.103424072265625 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } } \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-96fc3b4e585fc8cfcb4fcdd974640839b5a5889cf4f54dbf57ad6a3439b671d0/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-96fc3b4e585fc8cfcb4fcdd974640839b5a5889cf4f54dbf57ad6a3439b671d0/default/cache.json new file mode 100755 index 000000000..6f91d97c5 --- /dev/null +++ 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-96fc3b4e585fc8cfcb4fcdd974640839b5a5889cf4f54dbf57ad6a3439b671d0/default/cache.json @@ -0,0 +1,35 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention_tuned:kernel_unified_attention_2d)", + "total_bench_time_s": 364.13932609558105, + "evaluated_configs": 540, + "keys": [ + "MAX_SEQ_Q", + "MAX_SEQ_K", + "AVG_SEQ_Q", + "AVG_SEQ_K", + "num_query_heads", + "num_queries_per_kv", + "BLOCK_SIZE", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "SLIDING_WINDOW", + "stride_k_cache_3", + "stride_v_cache_3" + ], + "cache": { + "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 3, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.005123822949826717 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_tuned.py b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_tuned.py index 13294b887..421453344 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_tuned.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_tuned.py @@ -237,10 +237,16 @@ def prefill_heuristics_2d(MAX_SEQ_Q, MAX_SEQ_K, AVG_SEQ_Q, AVG_SEQ_K): }, num_warps=[2, 4, 8], num_stages=[1, 2, 4, 6, 8], - num_consumer_groups=[0, 2, 4], - num_buffers_warp_spec=[0, 3, 6], + # num_consumer_groups=[0, 2, 4], + # num_buffers_warp_spec=[0, 3, 6], + # num_consumer_groups=[2], + # num_buffers_warp_spec=[3], + num_consumer_groups=[2, 4], + num_buffers_warp_spec=[3, 6], conditions=[ - lambda c: c.num_consumer_groups !=0 and c.num_buffers_warp_spec != 0, + # ensure consistency for ws + lambda c: (c.num_consumer_groups !=0 and c.num_buffers_warp_spec != 0) \ + or (c.num_consumer_groups == 0 and c.num_buffers_warp_spec == 0), ] ), # this list is longer, since it would be used for multiple models diff --git a/scripts/setups/tune_2d_ws.conf b/scripts/setups/tune_2d_ws.conf index 517b9a441..88d73a27f 100644 --- a/scripts/setups/tune_2d_ws.conf +++ b/scripts/setups/tune_2d_ws.conf @@ -18,8 +18,8 @@ PROMPT_PATTERNS = [[1.0], [0.1, 0.4, 0.5, 1.0, 0.2]] # PROMPT_PATTERNS = [[1.0]] MAX_VALUES = [1.0] -# BENCHMARK_MODES = ["CUDA_EVENTS"] -BENCHMARK_MODES = ["CUDA_GRAPHS"] +BENCHMARK_MODES = ["CUDA_EVENTS"] +# BENCHMARK_MODES = ["CUDA_GRAPHS"] IMPLEMENTATION_UT = ["UNF_TRITON_2D_TUNED"] diff --git a/triton-dejavu b/triton-dejavu index 9de1daa0e..3ec45ef42 160000 --- a/triton-dejavu +++ b/triton-dejavu @@ -1 +1 @@ -Subproject commit 9de1daa0e61b056a23cf4796239629a8f6330995 +Subproject commit 3ec45ef425aca5bbcd026e2d32f9da6b4981f3c4 diff --git a/vllm b/vllm index f701d8648..d7cc6ee33 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit f701d8648bcc21ab65bb2b5637b6baf099278576 +Subproject commit d7cc6ee330d93b0398f3ead75ab779d8a7a1042f From 01280718edd4db31b99ab40ad024b99559af8146 Mon Sep 17 
00:00:00 2001 From: Burkhard Ringlein Date: Wed, 13 Aug 2025 04:50:22 -0400 Subject: [PATCH 39/61] fix tuning error Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json index e997541c2..84d454b13 100755 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json @@ -1,6 +1,6 @@ { "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention_tuned:kernel_unified_attention_2d)", - "total_bench_time_s": 21377.796743392944, + "total_bench_time_s": 21743.59187436104, "evaluated_configs": 2160, "keys": [ "MAX_SEQ_Q", @@ -17,7 +17,6 @@ "stride_v_cache_3" ], "cache": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", @@ -74,12 +73,10 @@ 
"('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 3, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" }, "timings": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006423796992748976 - ], "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ 0.006897487211972475 ], @@ -250,6 +247,9 @@ ], "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ 12.103424072265625 + ], + "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0063226004131138325 ] }, "timings_data": { From 836257cd8d8671ab80542d5c5d1edf029f84858a Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Wed, 13 Aug 2025 05:02:40 -0400 Subject: [PATCH 40/61] making it compatible with cuda graphs Signed-off-by: Burkhard Ringlein --- .../kernels/triton_unified_attention_tuned.py | 302 +++++++++--------- scripts/callers/unified_triton.py | 10 + scripts/setups/tune_2d_ws.conf | 7 +- 3 files changed, 167 insertions(+), 152 deletions(-) diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_tuned.py b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_tuned.py index 421453344..04b4944e2 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_tuned.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_tuned.py @@ -866,6 +866,10 @@ def unified_attention( q_descale, k_descale, v_descale, + MAX_SEQ_Q, + MAX_SEQ_K, + AVG_SEQ_Q, + AVG_SEQ_K, alibi_slopes=None, force_selection=None, # None, 2, 3 to select kernel ): @@ -888,156 +892,156 @@ def unified_attention( num_queries_per_kv = num_query_heads // num_kv_heads head_size = q.shape[2] - MAX_SEQ_Q = triton.next_power_of_2(int(max_seqlen_q)) - MAX_SEQ_K = triton.next_power_of_2(int(max_seqlen_k)) - AVG_SEQ_Q = triton.next_power_of_2(int(avg_seqlen_q)) - AVG_SEQ_K = triton.next_power_of_2(int(avg_seqlen_k)) + # MAX_SEQ_Q = triton.next_power_of_2(int(max_seqlen_q)) + # MAX_SEQ_K = triton.next_power_of_2(int(max_seqlen_k)) + # AVG_SEQ_Q = triton.next_power_of_2(int(avg_seqlen_q)) + # AVG_SEQ_K = 
triton.next_power_of_2(int(avg_seqlen_k)) # if batch contains a prefill - if (max_seqlen_q > 1 or force_selection == 2) and force_selection != 3: + # if (max_seqlen_q > 1 or force_selection == 2) and force_selection != 3: - grid = lambda META: ( - q.shape[0] // (META["BLOCK_M"] // num_queries_per_kv) + num_seqs, - num_kv_heads, - ) - - kernel_unified_attention_2d[grid]( - output_ptr=out, - query_ptr=q, - key_cache_ptr=k, - value_cache_ptr=v, - block_tables_ptr=block_table, - seq_lens_ptr=seqused_k, - alibi_slopes_ptr=alibi_slopes, - scale=softmax_scale, - k_scale=k_descale, - v_scale=v_descale, - softcap=softcap, - num_query_heads=num_query_heads, - num_queries_per_kv=num_queries_per_kv, - block_table_stride=block_table.stride(0), - query_stride_0=q.stride(0), - query_stride_1=q.stride(1), - output_stride_0=out.stride(0), - output_stride_1=out.stride(1), - BLOCK_SIZE=block_size, - HEAD_SIZE=head_size, - HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), - USE_ALIBI_SLOPES=use_alibi_slopes, - USE_SOFTCAP=(softcap > 0), - SLIDING_WINDOW=(1 + window_size[0]), - stride_k_cache_0=k.stride(0), - stride_k_cache_1=k.stride(1), - stride_k_cache_2=k.stride(2), - stride_k_cache_3=k.stride(3), - stride_v_cache_0=v.stride(0), - stride_v_cache_1=v.stride(1), - stride_v_cache_2=v.stride(2), - stride_v_cache_3=v.stride(3), - query_start_len_ptr=cu_seqlens_q, - num_seqs=num_seqs, - MAX_SEQ_Q=MAX_SEQ_Q, - MAX_SEQ_K=MAX_SEQ_K, - AVG_SEQ_Q=AVG_SEQ_Q, - AVG_SEQ_K=AVG_SEQ_K, - ) - else: - BLOCK_M = 64 if max_seqlen_q > 1 and avg_seqlen_q >= 4096 else 16 - BLOCK_Q = BLOCK_M // num_queries_per_kv - - # Ideally we would launch with kernel with: - # \sum_i[ceil(query_len[i] / BLOCK_Q)] blocks. - # However, it is slow to realize the query_lens on cpu. - # Instead we use upper-bound: - # \sum_i[ceil(query_len[i] / BLOCK_Q)] - # <= \sum_i[floor(query_len[i] / BLOCK_Q) + 1] - # = \sum_i[floor(query_len[i] / BLOCK_Q)] + num_seqs - # <= floor(\sum_i(query_len[i]) / BLOCK_Q) + num_seqs - # = floor(q.shape[0] / BLOCK_Q) + num_seqs - total_num_q_blocks = q.shape[0] // BLOCK_Q + num_seqs - - # for initial version, NUM_SEGMENTS = 16 is chosen as a default - # value that showed good performance in tests - NUM_SEGMENTS = 16 - - segm_output = torch.empty( - q.shape[0], - num_query_heads, - NUM_SEGMENTS, - triton.next_power_of_2(head_size), - dtype=torch.float32, - device=q.device, - ) - segm_max = torch.empty( - q.shape[0], - num_query_heads, - NUM_SEGMENTS, - dtype=torch.float32, - device=q.device, - ) - segm_expsum = torch.empty( - q.shape[0], - num_query_heads, - NUM_SEGMENTS, - dtype=torch.float32, - device=q.device, - ) - - kernel_unified_attention_3d[(total_num_q_blocks, num_kv_heads, NUM_SEGMENTS)]( - segm_output_ptr=segm_output, - segm_max_ptr=segm_max, - segm_expsum_ptr=segm_expsum, - query_ptr=q, - key_cache_ptr=k, - value_cache_ptr=v, - block_tables_ptr=block_table, - seq_lens_ptr=seqused_k, - alibi_slopes_ptr=alibi_slopes, - scale=softmax_scale, - k_scale=k_descale, - v_scale=v_descale, - softcap=softcap, - num_query_heads=num_query_heads, - num_queries_per_kv=num_queries_per_kv, - block_table_stride=block_table.stride(0), - query_stride_0=q.stride(0), - query_stride_1=q.stride(1), - BLOCK_SIZE=block_size, - HEAD_SIZE=head_size, - HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), - USE_ALIBI_SLOPES=use_alibi_slopes, - USE_SOFTCAP=(softcap > 0), - SLIDING_WINDOW=(1 + window_size[0]), - stride_k_cache_0=k.stride(0), - stride_k_cache_1=k.stride(1), - stride_k_cache_2=k.stride(2), - stride_k_cache_3=k.stride(3), - 
stride_v_cache_0=v.stride(0), - stride_v_cache_1=v.stride(1), - stride_v_cache_2=v.stride(2), - stride_v_cache_3=v.stride(3), - query_start_len_ptr=cu_seqlens_q, - BLOCK_Q=BLOCK_Q, - num_seqs=num_seqs, - BLOCK_M=BLOCK_M, - NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, - ) + grid = lambda META: ( + q.shape[0] // (META["BLOCK_M"] // num_queries_per_kv) + num_seqs, + num_kv_heads, + ) - reduce_segments[(q.shape[0], num_query_heads)]( - output_ptr=out, - segm_output_ptr=segm_output, - segm_max_ptr=segm_max, - segm_expsum_ptr=segm_expsum, - seq_lens_ptr=seqused_k, - num_seqs=num_seqs, - num_query_heads=num_query_heads, - output_stride_0=out.stride(0), - output_stride_1=out.stride(1), - block_table_stride=block_table.stride(0), - BLOCK_SIZE=block_size, - HEAD_SIZE=head_size, - HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), - query_start_len_ptr=cu_seqlens_q, - BLOCK_Q=BLOCK_Q, - NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, - ) + kernel_unified_attention_2d[grid]( + output_ptr=out, + query_ptr=q, + key_cache_ptr=k, + value_cache_ptr=v, + block_tables_ptr=block_table, + seq_lens_ptr=seqused_k, + alibi_slopes_ptr=alibi_slopes, + scale=softmax_scale, + k_scale=k_descale, + v_scale=v_descale, + softcap=softcap, + num_query_heads=num_query_heads, + num_queries_per_kv=num_queries_per_kv, + block_table_stride=block_table.stride(0), + query_stride_0=q.stride(0), + query_stride_1=q.stride(1), + output_stride_0=out.stride(0), + output_stride_1=out.stride(1), + BLOCK_SIZE=block_size, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + USE_ALIBI_SLOPES=use_alibi_slopes, + USE_SOFTCAP=(softcap > 0), + SLIDING_WINDOW=(1 + window_size[0]), + stride_k_cache_0=k.stride(0), + stride_k_cache_1=k.stride(1), + stride_k_cache_2=k.stride(2), + stride_k_cache_3=k.stride(3), + stride_v_cache_0=v.stride(0), + stride_v_cache_1=v.stride(1), + stride_v_cache_2=v.stride(2), + stride_v_cache_3=v.stride(3), + query_start_len_ptr=cu_seqlens_q, + num_seqs=num_seqs, + MAX_SEQ_Q=MAX_SEQ_Q, + MAX_SEQ_K=MAX_SEQ_K, + AVG_SEQ_Q=AVG_SEQ_Q, + AVG_SEQ_K=AVG_SEQ_K, + ) + # else: + # BLOCK_M = 64 if max_seqlen_q > 1 and avg_seqlen_q >= 4096 else 16 + # BLOCK_Q = BLOCK_M // num_queries_per_kv + + # # Ideally we would launch with kernel with: + # # \sum_i[ceil(query_len[i] / BLOCK_Q)] blocks. + # # However, it is slow to realize the query_lens on cpu. 
+ # # Instead we use upper-bound: + # # \sum_i[ceil(query_len[i] / BLOCK_Q)] + # # <= \sum_i[floor(query_len[i] / BLOCK_Q) + 1] + # # = \sum_i[floor(query_len[i] / BLOCK_Q)] + num_seqs + # # <= floor(\sum_i(query_len[i]) / BLOCK_Q) + num_seqs + # # = floor(q.shape[0] / BLOCK_Q) + num_seqs + # total_num_q_blocks = q.shape[0] // BLOCK_Q + num_seqs + + # # for initial version, NUM_SEGMENTS = 16 is chosen as a default + # # value that showed good performance in tests + # NUM_SEGMENTS = 16 + + # segm_output = torch.empty( + # q.shape[0], + # num_query_heads, + # NUM_SEGMENTS, + # triton.next_power_of_2(head_size), + # dtype=torch.float32, + # device=q.device, + # ) + # segm_max = torch.empty( + # q.shape[0], + # num_query_heads, + # NUM_SEGMENTS, + # dtype=torch.float32, + # device=q.device, + # ) + # segm_expsum = torch.empty( + # q.shape[0], + # num_query_heads, + # NUM_SEGMENTS, + # dtype=torch.float32, + # device=q.device, + # ) + + # kernel_unified_attention_3d[(total_num_q_blocks, num_kv_heads, NUM_SEGMENTS)]( + # segm_output_ptr=segm_output, + # segm_max_ptr=segm_max, + # segm_expsum_ptr=segm_expsum, + # query_ptr=q, + # key_cache_ptr=k, + # value_cache_ptr=v, + # block_tables_ptr=block_table, + # seq_lens_ptr=seqused_k, + # alibi_slopes_ptr=alibi_slopes, + # scale=softmax_scale, + # k_scale=k_descale, + # v_scale=v_descale, + # softcap=softcap, + # num_query_heads=num_query_heads, + # num_queries_per_kv=num_queries_per_kv, + # block_table_stride=block_table.stride(0), + # query_stride_0=q.stride(0), + # query_stride_1=q.stride(1), + # BLOCK_SIZE=block_size, + # HEAD_SIZE=head_size, + # HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + # USE_ALIBI_SLOPES=use_alibi_slopes, + # USE_SOFTCAP=(softcap > 0), + # SLIDING_WINDOW=(1 + window_size[0]), + # stride_k_cache_0=k.stride(0), + # stride_k_cache_1=k.stride(1), + # stride_k_cache_2=k.stride(2), + # stride_k_cache_3=k.stride(3), + # stride_v_cache_0=v.stride(0), + # stride_v_cache_1=v.stride(1), + # stride_v_cache_2=v.stride(2), + # stride_v_cache_3=v.stride(3), + # query_start_len_ptr=cu_seqlens_q, + # BLOCK_Q=BLOCK_Q, + # num_seqs=num_seqs, + # BLOCK_M=BLOCK_M, + # NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, + # ) + + # reduce_segments[(q.shape[0], num_query_heads)]( + # output_ptr=out, + # segm_output_ptr=segm_output, + # segm_max_ptr=segm_max, + # segm_expsum_ptr=segm_expsum, + # seq_lens_ptr=seqused_k, + # num_seqs=num_seqs, + # num_query_heads=num_query_heads, + # output_stride_0=out.stride(0), + # output_stride_1=out.stride(1), + # block_table_stride=block_table.stride(0), + # BLOCK_SIZE=block_size, + # HEAD_SIZE=head_size, + # HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + # query_start_len_ptr=cu_seqlens_q, + # BLOCK_Q=BLOCK_Q, + # NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, + # ) diff --git a/scripts/callers/unified_triton.py b/scripts/callers/unified_triton.py index 10457d8da..450465ae2 100644 --- a/scripts/callers/unified_triton.py +++ b/scripts/callers/unified_triton.py @@ -16,6 +16,7 @@ # import torch +import triton from ibm_triton_lib.kernels import unified_attention, unified_attention_simple, unified_attention_tuned from .base import PrefixPrefillCaller @@ -266,6 +267,11 @@ def make_call_func( avg_seqlen_q = query_lens.to(torch.float).mean() avg_seqlen_k = seq_lens.to(torch.float).mean() + + MAX_SEQ_Q = triton.next_power_of_2(int(max_query_len)) + MAX_SEQ_K = triton.next_power_of_2(int(max_seqlen)) + AVG_SEQ_Q = triton.next_power_of_2(int(avg_seqlen_q)) + AVG_SEQ_K = triton.next_power_of_2(int(avg_seqlen_k)) def 
call_and_process_output(): # k must have shape (num_blocks, page_block_size, num_heads_k, head_size) @@ -289,6 +295,10 @@ def call_and_process_output(): alibi_slopes=None, avg_seqlen_q=avg_seqlen_q, avg_seqlen_k=avg_seqlen_k, + MAX_SEQ_Q=MAX_SEQ_Q, + MAX_SEQ_K=MAX_SEQ_K, + AVG_SEQ_Q=AVG_SEQ_Q, + AVG_SEQ_K=AVG_SEQ_K, force_selection=2, ) diff --git a/scripts/setups/tune_2d_ws.conf b/scripts/setups/tune_2d_ws.conf index 88d73a27f..5eb1304b5 100644 --- a/scripts/setups/tune_2d_ws.conf +++ b/scripts/setups/tune_2d_ws.conf @@ -18,10 +18,11 @@ PROMPT_PATTERNS = [[1.0], [0.1, 0.4, 0.5, 1.0, 0.2]] # PROMPT_PATTERNS = [[1.0]] MAX_VALUES = [1.0] -BENCHMARK_MODES = ["CUDA_EVENTS"] -# BENCHMARK_MODES = ["CUDA_GRAPHS"] +# BENCHMARK_MODES = ["CUDA_EVENTS"] +BENCHMARK_MODES = ["CUDA_GRAPHS"] -IMPLEMENTATION_UT = ["UNF_TRITON_2D_TUNED"] +# IMPLEMENTATION_UT = ["UNF_TRITON_2D_TUNED"] +IMPLEMENTATION_UT = ["UNF_TRITON_2D_TUNED", "UNF_TRITON_2D_SIMPLE"] # TRITON_BACKEND_DEBUG = 1 # STORE_TEST_RESULT_PATH=/results From c1df8c6ed586dcf8080876d659531190c5c08c04 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Wed, 13 Aug 2025 07:25:51 -0400 Subject: [PATCH 41/61] adding kernels with static launch grid Signed-off-by: Burkhard Ringlein Co-authored-by: Jan Van Lunteren --- .../ibm_triton_lib/kernels/__init__.py | 1 + .../ibm_triton_lib/kernels/tmp_triton_attn.py | 486 ++++++++++ .../kernels/triton_unified_grid.py | 865 ++++++++++++++++++ scripts/benchmark.py | 12 + scripts/callers/__init__.py | 4 + scripts/callers/grid_triton.py | 193 ++++ scripts/setups/prefix_grid.conf | 35 + triton-dejavu | 2 +- 8 files changed, 1597 insertions(+), 1 deletion(-) create mode 100644 ibm-triton-lib/ibm_triton_lib/kernels/tmp_triton_attn.py create mode 100644 ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py create mode 100644 scripts/callers/grid_triton.py create mode 100644 scripts/setups/prefix_grid.conf diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py b/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py index 3574f8e2b..8332a82b5 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py @@ -70,6 +70,7 @@ def ConfigSpace( from .triton_unified_attention_simple import unified_attention as unified_attention_simple from .triton_unified_newtiles import unified_attention as unified_attention_newtiles from .triton_unified_attention_tuned import unified_attention as unified_attention_tuned +from .triton_unified_grid import unified_attention as unified_attention_grid from .mamba_ssm import selective_state_update diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/tmp_triton_attn.py b/ibm-triton-lib/ibm_triton_lib/kernels/tmp_triton_attn.py new file mode 100644 index 000000000..ba0242a30 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/tmp_triton_attn.py @@ -0,0 +1,486 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Attention layer with PagedAttention and Triton prefix prefill.""" +from dataclasses import dataclass +from typing import ClassVar, Optional + +import torch + +from vllm import _custom_ops as ops +from vllm import envs +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata, AttentionType) +from vllm.attention.ops.chunked_prefill_paged_decode import ( + chunked_prefill_paged_decode) +from vllm.attention.ops.paged_attn import PagedAttention +from vllm.attention.ops.triton_unified_attention import unified_attention +from 
vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata +from vllm.v1.attention.backends.utils import ( + AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, + reorder_batch_to_split_decodes_and_prefills) +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.kv_cache_interface import AttentionSpec +from vllm.v1.worker.gpu_input_batch import InputBatch + +logger = init_logger(__name__) + + +@dataclass +class TritonAttentionMetadata: + # NOTE(sang): Definition of context_len, query_len, and seq_len. + # |---------- N-1 iteration --------| + # |---------------- N iteration ---------------------| + # |- tokenA -|......................|-- newTokens ---| + # |---------- context_len ----------| + # |-------------------- seq_len ---------------------| + # |-- query_len ---| + + num_actual_tokens: int # Number of tokens excluding padding. + max_query_len: int + query_start_loc: torch.Tensor + num_decodes: int + max_seq_len: int + seq_lens: torch.Tensor + block_table: torch.Tensor + slot_mapping: torch.Tensor + use_split_kv: bool + segm_output: torch.Tensor + segm_max: torch.Tensor + segm_expsum: torch.Tensor + BLOCK_M_PREFILL: int + BLOCK_Q_PREFILL: int + BLOCK_M_DECODE: int + BLOCK_Q_DECODE: int + num_q_blocks: int + block_q_seq_boundaries: torch.Tensor + + # For cascade attention. + use_cascade: bool + common_prefix_len: int + cu_prefix_query_lens: Optional[torch.Tensor] + prefix_kv_lens: Optional[torch.Tensor] + suffix_kv_lens: Optional[torch.Tensor] + + # Optional aot scheduling + scheduler_metadata: Optional[torch.Tensor] = None + prefix_scheduler_metadata: Optional[torch.Tensor] = None + + +class TritonAttentionMetadataBuilder( + AttentionMetadataBuilder[TritonAttentionMetadata]): + attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.ALWAYS + + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], + vllm_config: VllmConfig, device: torch.device): + self.device = device + self.block_size = kv_cache_spec.block_size + self.kv_cache_spec = kv_cache_spec + + model_config = vllm_config.model_config + self.num_heads_q = model_config.get_num_attention_heads( + vllm_config.parallel_config) + self.num_heads_kv = model_config.get_num_kv_heads( + vllm_config.parallel_config) + self.headdim = model_config.get_head_size() + + def reorder_batch(self, input_batch: InputBatch, + scheduler_output: SchedulerOutput) -> bool: + return reorder_batch_to_split_decodes_and_prefills(input_batch, + scheduler_output, + decode_threshold=1) + + def build_for_cudagraph_capture( + self, common_attn_metadata: CommonAttentionMetadata + ) -> TritonAttentionMetadata: + attn_metadata = self.build(0, common_attn_metadata) + # When doing full graph capture, setting seq_lens to + # max_model_len will cause graph capture to be extremely + # slow, so here we set it to 1. 
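+        # capture only needs plausible values here; at replay time the real
+        # lengths are expected to be present in this same (persistent) buffer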
+ attn_metadata.seq_lens.fill_(1) + return attn_metadata + + def build(self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False) -> TritonAttentionMetadata: + num_actual_tokens = common_attn_metadata.num_actual_tokens + max_query_len = common_attn_metadata.max_query_len + + max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) + query_start_loc = common_attn_metadata.query_start_loc + seq_lens = common_attn_metadata.seq_lens + + query_lens = torch.diff(query_start_loc) + if max_query_len == 1: + num_decodes = len(seq_lens) + else: + num_decodes = torch.argmax((query_lens != 1).int()).item() + + BLOCK_M_PREFILL = 64 + BLOCK_M_DECODE = 16 + BLOCK_Q_PREFILL = BLOCK_M_PREFILL * self.num_heads_kv // self.num_heads_q + BLOCK_Q_DECODE = BLOCK_M_DECODE * self.num_heads_kv // self.num_heads_q + + block_q_seq_boundaries = torch.cumsum(torch.cat([torch.tensor([0], dtype=query_lens.dtype, device=query_lens.device), torch.ceil(query_lens[num_decodes:] / BLOCK_Q_PREFILL).to(torch.int)]), dim=0) + num_q_blocks = block_q_seq_boundaries[-1].item() + + block_table_tensor = common_attn_metadata.block_table_tensor + slot_mapping = common_attn_metadata.slot_mapping + + use_split_kv = (num_q_blocks * self.num_heads_kv < 128) + + NUM_SEGMENTS=16 + + if use_split_kv: + segm_output = torch.empty( + num_decodes, + self.num_heads_q, + NUM_SEGMENTS, + self.headdim, #triton.next_power_of_2(head_size), + dtype=torch.float32, + device=seq_lens.device, + ) + segm_max = torch.empty( + num_decodes, + self.num_heads_q, + NUM_SEGMENTS, + dtype=torch.float32, + device=seq_lens.device, + ) + segm_expsum = torch.empty( + num_decodes, + self.num_heads_q, + NUM_SEGMENTS, + dtype=torch.float32, + device=seq_lens.device, + ) + else: + segm_output = None + segm_max = None + segm_expsum = None + + use_cascade = common_prefix_len > 0 + + if use_cascade: + cu_prefix_query_lens = torch.tensor([0, num_actual_tokens], + dtype=torch.int32, + device=self.device) + prefix_kv_lens = torch.tensor([common_prefix_len], + dtype=torch.int32, + device=self.device) + suffix_kv_lens = (common_attn_metadata.seq_lens_cpu - + common_prefix_len) + suffix_kv_lens = suffix_kv_lens.to(self.device) + else: + cu_prefix_query_lens = None + prefix_kv_lens = None + suffix_kv_lens = None + prefix_scheduler_metadata = None + + attn_metadata = TritonAttentionMetadata( + num_actual_tokens=num_actual_tokens, + max_query_len=max_query_len, + query_start_loc=query_start_loc, + num_decodes=num_decodes, + max_seq_len=max_seq_len, + seq_lens=seq_lens, + block_table=block_table_tensor, + slot_mapping=slot_mapping, + use_cascade=use_cascade, + common_prefix_len=common_prefix_len, + cu_prefix_query_lens=cu_prefix_query_lens, + prefix_kv_lens=prefix_kv_lens, + suffix_kv_lens=suffix_kv_lens, + prefix_scheduler_metadata=prefix_scheduler_metadata, + use_split_kv=use_split_kv, + segm_output=segm_output, + segm_max=segm_max, + segm_expsum=segm_expsum, + BLOCK_M_PREFILL=BLOCK_M_PREFILL, + BLOCK_Q_PREFILL=BLOCK_Q_PREFILL, + BLOCK_M_DECODE=BLOCK_M_DECODE, + BLOCK_Q_DECODE=BLOCK_Q_DECODE, + num_q_blocks=num_q_blocks, + block_q_seq_boundaries=block_q_seq_boundaries, + ) + return attn_metadata + + def can_run_in_cudagraph( + self, common_attn_metadata: CommonAttentionMetadata) -> bool: + # Full CUDA Graph always supported + return True + + +class TritonAttentionBackend(AttentionBackend): + + accept_output_buffer: bool = True + + @classmethod + def get_supported_dtypes(cls) -> list[torch.dtype]: + return [torch.float16, 
torch.bfloat16] + + @classmethod + def get_supported_head_sizes(cls) -> list[int]: + return [32, 64, 96, 128, 160, 192, 224, 256] + + @classmethod + def validate_head_size(cls, head_size: int) -> None: + supported_head_sizes = cls.get_supported_head_sizes() + if head_size not in supported_head_sizes: + attn_type = cls.__name__.removesuffix("Backend") + raise ValueError( + f"Head size {head_size} is not supported by {attn_type}. " + f"Supported head sizes are: {supported_head_sizes}. " + "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use " + "FlexAttention backend which supports all head sizes.") + + @staticmethod + def get_name() -> str: + return "TRITON_ATTN_VLLM_V1" + + @staticmethod + def get_impl_cls() -> type["TritonAttentionImpl"]: + return TritonAttentionImpl + + @staticmethod + def get_metadata_cls() -> type["AttentionMetadata"]: + return TritonAttentionMetadata + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> tuple[int, ...]: + if block_size % 16 != 0: + raise ValueError("Block size must be a multiple of 16.") + return (num_blocks, 2, block_size, num_kv_heads, head_size) + + @staticmethod + def use_cascade_attention(*args, **kwargs) -> bool: + return False + + @staticmethod + def get_builder_cls() -> type["TritonAttentionMetadataBuilder"]: + return TritonAttentionMetadataBuilder + + +class TritonAttentionImpl(AttentionImpl): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[list[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + logits_soft_cap: Optional[float] = None, + attn_type: AttentionType = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[int] = None, + ) -> None: + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_kv_heads + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + if sliding_window is None: + self.sliding_window = (-1, -1) + else: + self.sliding_window = (sliding_window - 1, 0) + self.kv_cache_dtype = kv_cache_dtype + if logits_soft_cap is None: + # In flash-attn, setting logits_soft_cap as 0 means no soft cap. + logits_soft_cap = 0 + self.logits_soft_cap = logits_soft_cap + self.kv_sharing_target_layer_name = kv_sharing_target_layer_name + + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + + TritonAttentionBackend.validate_head_size(head_size) + + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "TritonAttentionImpl") + + self.fp8_dtype = current_platform.fp8_dtype() + self.force_prefill_decode_attn = \ + envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION + + def forward( + self, + layer: torch.nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: FlashAttentionMetadata, + output: Optional[torch.Tensor] = None, + output_scale: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Forward pass with FlashAttention. + + Args: + query: shape = [num_tokens, num_heads, head_size] + key: shape = [num_tokens, num_kv_heads, head_size] + value: shape = [num_tokens, num_kv_heads, head_size] + kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + attn_metadata: Metadata for attention. 
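+            output: pre-allocated output buffer (required by this backend,
+                see the assert below)
+            output_scale: scale for fused output quantization (not yet
+                supported here)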
+ Returns: + shape = [num_tokens, num_heads * head_size] + """ + assert output is not None, "Output tensor must be provided." + + if output_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported" + " for TritonAttentionImpl") + + if attn_metadata is None: + # Profiling run. + return output + + assert attn_metadata.use_cascade is False + + # IMPORTANT! + # NOTE(woosuk): With piece-wise CUDA graphs, this method is executed in + # eager-mode PyTorch. Thus, we need to be careful about any CPU overhead + # in this method. For example, `view` and `slice` (or `[:n]`) operations + # are surprisingly slow even in the case they do not invoke any GPU ops. + # Minimize the PyTorch ops in this method as much as possible. + # Whenever making a change in this method, please benchmark the + # performance to make sure it does not introduce any overhead. + + use_prefill_decode_attn = self.force_prefill_decode_attn + num_actual_tokens = attn_metadata.num_actual_tokens + + if use_prefill_decode_attn: + key_cache, value_cache = PagedAttention.split_kv_cache( + kv_cache, self.num_kv_heads, self.head_size) + else: + key_cache, value_cache = kv_cache.unbind(1) + + if self.kv_sharing_target_layer_name is None: + # Reshape the input keys and values and store them in the cache. + # Skip this if sharing KV cache with an earlier attention layer. + if use_prefill_decode_attn: + PagedAttention.write_to_paged_cache( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) + else: + torch.ops._C_cache_ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) + + if self.kv_cache_dtype.startswith("fp8"): + key_cache = key_cache.view(self.fp8_dtype) + value_cache = value_cache.view(self.fp8_dtype) + num_tokens, num_heads, head_size = query.shape + assert layer._q_scale == 1.0, \ + "A non 1.0 q_scale is not currently supported." + if not current_platform.is_rocm(): + # Skip Q quantization on ROCm, since dequantizing back to + # f32 in the attention kernel is not supported. + query, _ = ops.scaled_fp8_quant( + query.reshape( + (num_tokens, num_heads * head_size)).contiguous(), + layer._q_scale) + query = query.reshape((num_tokens, num_heads, head_size)) + + cu_seqlens_q = attn_metadata.query_start_loc + num_decodes = attn_metadata.num_decodes + seqused_k = attn_metadata.seq_lens + max_seqlen_q = attn_metadata.max_query_len + max_seqlen_k = attn_metadata.max_seq_len + block_table = attn_metadata.block_table + + use_split_kv = attn_metadata.use_split_kv + segm_output = attn_metadata.segm_output + segm_max = attn_metadata.segm_max + segm_expsum = attn_metadata.segm_expsum + + BLOCK_M_PREFILL = attn_metadata.BLOCK_M_PREFILL + BLOCK_Q_PREFILL = attn_metadata.BLOCK_Q_PREFILL + BLOCK_M_DECODE = attn_metadata.BLOCK_M_DECODE + BLOCK_Q_DECODE = attn_metadata.BLOCK_Q_DECODE + num_q_blocks = attn_metadata.num_q_blocks + block_q_seq_boundaries = attn_metadata.block_q_seq_boundaries + + if use_prefill_decode_attn: + # Compute attention and update output up to `num_actual_tokens`. 
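+            # this branch keeps the stock chunked-prefill + paged-decode path
+            # (VLLM_V1_USE_PREFILL_DECODE_ATTENTION); the else branch below
+            # calls the unified kernel variant with the static launch grid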
+ chunked_prefill_paged_decode(query=query[:num_actual_tokens], + key=key[:num_actual_tokens], + value=value[:num_actual_tokens], + output=output[:num_actual_tokens], + kv_cache_dtype=self.kv_cache_dtype, + key_cache=key_cache, + value_cache=value_cache, + block_table=block_table, + query_start_loc=cu_seqlens_q, + seq_lens=seqused_k, + max_seq_len=max_seqlen_k, + max_query_len=max_seqlen_q, + k_scale=layer._k_scale, + v_scale=layer._v_scale, + alibi_slopes=self.alibi_slopes, + sliding_window=self.sliding_window[0], + sm_scale=self.scale) + else: + descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1]) + + unified_attention( + q=query[:num_actual_tokens], + k=key_cache, + v=value_cache, + out=output[:num_actual_tokens], + cu_seqlens_q=cu_seqlens_q, + max_seqlen_q=max_seqlen_q, + num_decodes=num_decodes, + seqused_k=seqused_k, + max_seqlen_k=max_seqlen_k, + softmax_scale=self.scale, + causal=True, + alibi_slopes=self.alibi_slopes, + window_size=self.sliding_window, + block_table=block_table, + softcap=self.logits_soft_cap, + q_descale=None, # Not supported + k_descale=layer._k_scale.expand(descale_shape), + v_descale=layer._v_scale.expand(descale_shape), + use_split_kv=use_split_kv, + segm_output=segm_output, + segm_max=segm_max, + segm_expsum=segm_expsum, + BLOCK_M_PREFILL=BLOCK_M_PREFILL, + BLOCK_Q_PREFILL=BLOCK_Q_PREFILL, + BLOCK_M_DECODE=BLOCK_M_DECODE, + BLOCK_Q_DECODE=BLOCK_Q_DECODE, + num_q_blocks=num_q_blocks, + block_q_seq_boundaries=block_q_seq_boundaries + ) + + return output diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py new file mode 100644 index 000000000..c8b2489dd --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py @@ -0,0 +1,865 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Authors: +# - Burkhard Ringlein +# - Jan van Lunteren +# - Chih-Chieh Yang +# - Thomas Parnell + +import torch + +from vllm.logger import init_logger +from vllm.triton_utils import tl, triton + +logger = init_logger(__name__) + + +@triton.jit +def cdiv_fn(x, y): + return (x + y - 1) // y + + +@triton.jit +def apply_softcap(S, x): + Sdiv = S / x + p1 = tl.exp(Sdiv) + p2 = tl.exp(-Sdiv) + return x * (p1 - p2) / (p1 + p2) + + +@triton.jit +def find_seq_idx(boundary_ptr, target_idx, num_seqs): + left: tl.int32 = 0 + right = num_seqs + while left < right: + mid = (left + right) // 2 + val = tl.load(boundary_ptr + mid) + if val <= target_idx: + left = mid + 1 + else: + right = mid + return left - 1 + + +@triton.jit +def kernel_unified_attention_2d( + output_ptr, # [num_tokens, num_query_heads, head_size] + query_ptr, # [num_tokens, num_query_heads, head_size] + key_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + value_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_query_heads] + qq_bias_ptr, # [num_query_tokens, num_query_tokens] + scale, # float32 + k_scale, # float32 + v_scale, # float32 + softcap, # float32 + num_query_heads: tl.constexpr, # int + num_queries_per_kv: tl.constexpr, # int + block_table_stride: tl.int64, # int + query_stride_0: tl.int64, # int + query_stride_1: tl.int64, # int, should be equal to head_size + output_stride_0: tl.int64, # int + output_stride_1: tl.int64, # int, should be equal to head_size + qq_bias_stride_0: tl.int64, # int + BLOCK_SIZE: tl.constexpr, # 
int + TILE_SIZE: tl.constexpr, # int must be power of 2 + HEAD_SIZE: tl.constexpr, # int + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + USE_ALIBI_SLOPES: tl.constexpr, # bool + USE_QQ_BIAS: tl.constexpr, # bool + USE_SOFTCAP: tl.constexpr, # bool + SLIDING_WINDOW: tl.constexpr, # int + stride_k_cache_0: tl.int64, # int + stride_k_cache_1: tl.int64, # int + stride_k_cache_2: tl.int64, # int + stride_k_cache_3: tl.constexpr, # int + stride_v_cache_0: tl.int64, # int + stride_v_cache_1: tl.int64, # int + stride_v_cache_2: tl.int64, # int + stride_v_cache_3: tl.constexpr, # int + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + num_seqs: tl.int32, + seq_idx_offset, # int + BLOCK_M: tl.constexpr, # int + block_q_seq_boundaries_ptr, # [num_prefills] or None + is_prefill: tl.constexpr, + max_q_block_idx: tl.int32, # int + q_block_iterations: tl.int32, # int +): + if tl.program_id(0) * q_block_iterations > max_q_block_idx: + return + + for q_block_global_idx in range(tl.program_id(0) * q_block_iterations, min((tl.program_id(0) + 1) * q_block_iterations, max_q_block_idx + 1)): + kv_head_idx = tl.program_id(1) + + if is_prefill: + seq_idx = find_seq_idx(block_q_seq_boundaries_ptr, q_block_global_idx, num_seqs) + q_block_start_idx = tl.load(block_q_seq_boundaries_ptr + seq_idx) + else: + seq_idx = q_block_global_idx + q_block_start_idx = seq_idx + seq_idx = seq_idx + seq_idx_offset + + q_block_local_idx = q_block_global_idx - q_block_start_idx + + cur_batch_in_all_start_index = tl.load(query_start_len_ptr + seq_idx) + cur_batch_in_all_stop_index = tl.load(query_start_len_ptr + seq_idx + 1) + + cur_batch_query_len = cur_batch_in_all_stop_index \ + - cur_batch_in_all_start_index + + #if q_block_local_idx * BLOCK_Q >= cur_batch_query_len: + # return + + offs_m = tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_SIZE_PADDED) + offs_t = tl.arange(0, TILE_SIZE) + query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv + + query_offset_0 = cur_batch_in_all_start_index + query_pos + query_offset_1 = kv_head_idx * num_queries_per_kv + \ + offs_m % num_queries_per_kv + query_offset = (query_offset_0[:, None] * query_stride_0 + + query_offset_1[:, None] * query_stride_1 + offs_d[None, :]) + + dim_mask = tl.where(offs_d < HEAD_SIZE, 1, 0).to(tl.int1) + query_mask_0 = tl.where(query_pos < cur_batch_query_len, 1, 0).to(tl.int1) + query_mask_1 = tl.where(query_offset_1 < num_query_heads, 1, 0).to(tl.int1) + + # Q : (BLOCK_M, HEAD_SIZE_PADDED) + Q = tl.load( + query_ptr + query_offset, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + other=0.0, + ) + + block_table_offset = seq_idx * block_table_stride + + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + L = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) + + # sequence len for this particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # context length for this particular sequences + context_len = seq_len - cur_batch_query_len + + # alibi slope for this head + if USE_ALIBI_SLOPES: + alibi_slope = tl.load(alibi_slopes_ptr + query_offset_1, + mask=query_mask_1, + other=0.0) + + # query-query attention bias + if USE_QQ_BIAS: + qq_bias_row_ptrs = (qq_bias_ptr + query_pos[:, None] * qq_bias_stride_0 + ) # shape: [BLOCK_M] + + # compute the length of the longest sequence prefix spanned by any + # query token in the current q_block (q_block_local_idx) + max_seq_prefix_len = context_len + q_block_local_idx * BLOCK_Q + 
( + BLOCK_M - 1) // num_queries_per_kv + 1 + + # adjust for potential padding in the last q_block by considering the + # actual sequence length + max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len) + + # calculate the number of tiles that need to be processed to + # cover the longest sequence prefix (due to causal masking, tiles beyond + # this prefix can be skipped) + num_tiles = cdiv_fn(max_seq_prefix_len, TILE_SIZE) + + # iterate through tiles + for j in range(0, num_tiles): + seq_offset = j * TILE_SIZE + offs_t + tile_mask = seq_offset < max_seq_prefix_len + + physical_block_idx = tl.load(block_tables_ptr + block_table_offset + + seq_offset // BLOCK_SIZE).to(tl.int64) + + v_offset = (physical_block_idx[:, None] * stride_v_cache_0 + + kv_head_idx * stride_v_cache_2 + + offs_d[None, :] * stride_v_cache_3 + + (seq_offset % BLOCK_SIZE)[:, None] * stride_v_cache_1) + + k_offset = (physical_block_idx[None, :] * stride_k_cache_0 + + kv_head_idx * stride_k_cache_2 + + offs_d[:, None] * stride_k_cache_3 + + (seq_offset % BLOCK_SIZE)[None, :] * stride_k_cache_1) + + # K : (HEAD_SIZE, TILE_SIZE) + K_load = tl.load(key_cache_ptr + k_offset, + mask=dim_mask[:, None] & tile_mask[None, :], + other=0.0) + + if K_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + K = K_load + else: + K = (K_load.to(tl.float32) * tl.load(k_scale)).to(Q.dtype) + else: + K = K_load + + # V : (TILE_SIZE, HEAD_SIZE) + V_load = tl.load(value_cache_ptr + v_offset, + mask=dim_mask[None, :] & tile_mask[:, None], + other=0.0) + + if V_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + V = V_load + else: + V = (V_load.to(tl.float32) * tl.load(v_scale)).to(Q.dtype) + else: + V = V_load + + seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 + + # S : (BLOCK_M, TILE_SIZE) + S = tl.zeros(shape=(BLOCK_M, TILE_SIZE), dtype=tl.float32) + + S += scale * tl.dot(Q, K) + + if USE_SOFTCAP: + S = apply_softcap(S, softcap) + + S = tl.where(query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, + S, float("-inf")) + + if SLIDING_WINDOW > 0: + S = tl.where((context_len + query_pos[:, None] - seq_offset) + < SLIDING_WINDOW, S, float("-inf")) + + if USE_ALIBI_SLOPES: + S += alibi_slope[:, None] * (seq_offset - context_len) + + if USE_QQ_BIAS: + # compute key positions relative to query section + key_rel_pos = seq_offset - context_len # shape: [BLOCK_SIZE] + # load bias only for keys that correspond to queries + is_query_key = key_rel_pos >= 0 and key_rel_pos < qq_bias_stride_0 + qq_bias = tl.load( + qq_bias_row_ptrs + key_rel_pos[None, :], + mask=is_query_key[None, :], # avoid OOB for context keys + other=0.0, + ) + S += qq_bias + + # compute running maximum + # m_j : (BLOCK_M,) + m_j = tl.maximum(M, tl.max(S, axis=1)) + + # For sliding window there's a chance the max is -inf due to masking of + # the entire row. 
In this case we need to set m_j 0 to avoid NaN + m_j = tl.where(m_j > float("-inf"), m_j, 0.0) + + # P : (BLOCK_M, TILE_SIZE) + P = tl.exp(S - m_j[:, None]) + + # l_j : (BLOCK_M,) + l_j = tl.sum(P, axis=1) + + # alpha : (BLOCK_M, ) + alpha = tl.exp(M - m_j) + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc = acc * alpha[:, None] + + # update constants + L = L * alpha + l_j + M = m_j + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc += tl.dot(P.to(V.dtype), V) + + # epilogue + acc = acc / L[:, None] + + output_offset = (query_offset_0[:, None] * output_stride_0 + + query_offset_1[:, None] * output_stride_1 + + offs_d[None, :]) + + tl.store( + output_ptr + output_offset, + acc, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + ) + + +@triton.jit +def kernel_unified_attention_3d( + segm_output_ptr, + # [num_tokens, num_query_heads, num_segments, head_size] + segm_max_ptr, # [num_tokens, num_query_heads, num_segments] + segm_expsum_ptr, # [num_tokens, num_query_heads, num_segments] + query_ptr, # [num_tokens, num_query_heads, head_size] + key_cache_ptr, # [num_blks, num_kv_heads, head_size // x, blk_size, x] + value_cache_ptr, # [num_blks, num_kv_heads, head_size, blk_size] + block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_query_heads] + qq_bias_ptr, # [num_query_tokens, num_query_tokens] + scale, # float32 + k_scale, # float32 + v_scale, # float32 + softcap, # float32 + num_query_heads: tl.constexpr, # int + num_queries_per_kv: tl.constexpr, # int + block_table_stride: tl.int64, # int + query_stride_0: tl.int64, # int + query_stride_1: tl.int64, # int, should be equal to head_size + qq_bias_stride_0: tl.int64, # int + BLOCK_SIZE: tl.constexpr, # int + TILE_SIZE: tl.constexpr, # int, must be power of 2 + HEAD_SIZE: tl.constexpr, # int + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + USE_ALIBI_SLOPES: tl.constexpr, # bool + USE_QQ_BIAS: tl.constexpr, # bool + USE_SOFTCAP: tl.constexpr, # bool + SLIDING_WINDOW: tl.constexpr, # int + stride_k_cache_0: tl.int64, # int + stride_k_cache_1: tl.int64, # int + stride_k_cache_2: tl.int64, # int + stride_k_cache_3: tl.constexpr, # int + stride_v_cache_0: tl.int64, # int + stride_v_cache_1: tl.int64, # int + stride_v_cache_2: tl.int64, # int + stride_v_cache_3: tl.constexpr, # int + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + num_seqs: tl.int32, + BLOCK_M: tl.constexpr, # int + NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int + seq_idx_iterations: tl.int32, # int +): + if tl.program_id(0) * seq_idx_iterations >= num_seqs: + return + + for seq_idx in range(tl.program_id(0) * seq_idx_iterations, min((tl.program_id(0) + 1) * seq_idx_iterations, num_seqs)): + kv_head_idx = tl.program_id(1) + segm_idx = tl.program_id(2) + + # sequence len for this particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # number of segments for this particular sequence + num_segments = NUM_SEGMENTS_PER_SEQ + tiles_per_segment = cdiv_fn(seq_len, num_segments * TILE_SIZE) + + #if segm_idx * tiles_per_segment * TILE_SIZE >= seq_len: + # return + + offs_m = tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, HEAD_SIZE_PADDED) + offs_t = tl.arange(0, TILE_SIZE) + query_pos = offs_m // num_queries_per_kv + + query_offset_0 = seq_idx + query_pos #cur_batch_in_all_start_index + query_pos + query_offset_1 = kv_head_idx * num_queries_per_kv + \ + offs_m % num_queries_per_kv + query_offset = (query_offset_0[:, None] * query_stride_0 + + query_offset_1[:, None] * 
query_stride_1 + offs_d[None, :]) + + dim_mask = tl.where(offs_d < HEAD_SIZE, 1, 0).to(tl.int1) + query_mask_0 = tl.where(query_pos < 1, 1, 0).to(tl.int1) + query_mask_1 = tl.where(query_offset_1 < num_query_heads, 1, 0).to(tl.int1) + + # Q : (BLOCK_M, HEAD_SIZE_PADDED) + Q = tl.load( + query_ptr + query_offset, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + other=0.0, + ) + + block_table_offset = seq_idx * block_table_stride + + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + L = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) + + # context length for this particular sequences + context_len = seq_len - 1 + + # alibi slope for this head + if USE_ALIBI_SLOPES: + alibi_slope = tl.load(alibi_slopes_ptr + query_offset_1, + mask=query_mask_1, + other=0.0) + + # query-query attention bias + if USE_QQ_BIAS: + qq_bias_row_ptrs = (qq_bias_ptr + query_pos[:, None] * qq_bias_stride_0 + ) # shape: [BLOCK_M] + + num_tiles = cdiv_fn(seq_len, TILE_SIZE) + + # iterate through tiles within current segment + for j in range( + segm_idx * tiles_per_segment, + min((segm_idx + 1) * tiles_per_segment, num_tiles), + ): + seq_offset = j * TILE_SIZE + offs_t + tile_mask = seq_offset < seq_len + + physical_block_idx = tl.load(block_tables_ptr + block_table_offset + + seq_offset // BLOCK_SIZE).to(tl.int64) + + v_offset = (physical_block_idx[:, None] * stride_v_cache_0 + + kv_head_idx * stride_v_cache_2 + + offs_d[None, :] * stride_v_cache_3 + + (seq_offset % BLOCK_SIZE)[:, None] * stride_v_cache_1) + + k_offset = (physical_block_idx[None, :] * stride_k_cache_0 + + kv_head_idx * stride_k_cache_2 + + offs_d[:, None] * stride_k_cache_3 + + (seq_offset % BLOCK_SIZE)[None, :] * stride_k_cache_1) + + # K : (HEAD_SIZE, TILE_SIZE) + K_load = tl.load(key_cache_ptr + k_offset, + mask=dim_mask[:, None] & tile_mask[None, :], + other=0.0) + + if K_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + K = K_load + else: + K = (K_load.to(tl.float32) * tl.load(k_scale)).to(Q.dtype) + else: + K = K_load + + # V : (TILE_SIZE, HEAD_SIZE) + V_load = tl.load(value_cache_ptr + v_offset, + mask=dim_mask[None, :] & tile_mask[:, None], + other=0.0) + + if V_load.dtype.is_fp8(): + if Q.dtype.is_fp8(): + V = V_load + else: + V = (V_load.to(tl.float32) * tl.load(v_scale)).to(Q.dtype) + else: + V = V_load + + seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 + + # S : (BLOCK_M, TILE_SIZE) + S = tl.zeros(shape=(BLOCK_M, TILE_SIZE), dtype=tl.float32) + S += scale * tl.dot(Q, K) + + if USE_SOFTCAP: + S = apply_softcap(S, softcap) + + S = tl.where(query_mask_1[:, None] & query_mask_0[:, None] & seq_mask, + S, float("-inf")) + + if SLIDING_WINDOW > 0: + S = tl.where((context_len + query_pos[:, None] - seq_offset) + < SLIDING_WINDOW, S, float("-inf")) + + if USE_ALIBI_SLOPES: + S += alibi_slope[:, None] * (seq_offset - context_len) + + if USE_QQ_BIAS: + # compute key positions relative to query section + key_rel_pos = seq_offset - context_len # shape: [BLOCK_SIZE] + # load bias only for keys that correspond to queries + is_query_key = key_rel_pos >= 0 and key_rel_pos < qq_bias_stride_0 + qq_bias = tl.load( + qq_bias_row_ptrs + key_rel_pos[None, :], + mask=is_query_key[None, :], # avoid OOB for context keys + other=0.0, + ) + S += qq_bias + + # compute running maximum + # m_j : (BLOCK_M,) + m_j = tl.maximum(M, tl.max(S, axis=1)) + + # For sliding window there's a chance the max is -inf due to masking of + # the entire row. 
In this case we need to set m_j 0 to avoid NaN + m_j = tl.where(m_j > float("-inf"), m_j, 0.0) + + # P : (BLOCK_M, TILE_SIZE,) + P = tl.exp(S - m_j[:, None]) + + # l_j : (BLOCK_M,) + l_j = tl.sum(P, axis=1) + + # alpha : (BLOCK_M, ) + alpha = tl.exp(M - m_j) + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc = acc * alpha[:, None] + + # update constants + L = L * alpha + l_j + M = m_j + + # acc : (BLOCK_M, HEAD_SIZE_PADDED) + acc += tl.dot(P.to(V.dtype), V) + + segm_output_offset = ( + query_offset_0[:, None].to(tl.int64) * + (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + query_offset_1[:, None] * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + segm_idx * HEAD_SIZE_PADDED + tl.arange(0, HEAD_SIZE_PADDED)[None, :]) + tl.store( + segm_output_ptr + segm_output_offset, + acc, + mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], + ) + segm_offset = (query_offset_0.to(tl.int64) * + (num_query_heads * NUM_SEGMENTS_PER_SEQ) + + query_offset_1 * NUM_SEGMENTS_PER_SEQ + segm_idx) + tl.store(segm_max_ptr + segm_offset, M, mask=query_mask_0 & query_mask_1) + tl.store(segm_expsum_ptr + segm_offset, + L, + mask=query_mask_0 & query_mask_1) + + +@triton.jit +def reduce_segments( + output_ptr, # [num_tokens, num_query_heads, head_size] + segm_output_ptr, + #[num_tokens, num_query_heads, max_num_segments, head_size] + segm_max_ptr, # [num_tokens, num_query_heads, max_num_segments] + segm_expsum_ptr, # [num_tokens, num_query_heads, max_num_segments] + seq_lens_ptr, # [num_seqs] + num_seqs, # int + num_query_heads: tl.constexpr, # int + output_stride_0: tl.int64, # int + output_stride_1: tl.int64, # int, should be equal to head_size + block_table_stride: tl.int64, # int + TILE_SIZE: tl.constexpr, # int + HEAD_SIZE: tl.constexpr, # int, must be power of 2 + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + query_start_len_ptr, # [num_seqs+1] + NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int + seq_idx_iterations: tl.int32, # int +): + if tl.program_id(0) * seq_idx_iterations >= num_seqs: + return + + for seq_idx in range(tl.program_id(0) * seq_idx_iterations, min((tl.program_id(0) + 1) * seq_idx_iterations, num_seqs)): + query_head_idx = tl.program_id(1) + + # sequence len for this particular sequence + seq_len = tl.load(seq_lens_ptr + seq_idx) + + # number of segments for this particular sequence + num_segments = NUM_SEGMENTS_PER_SEQ + tiles_per_segment = cdiv_fn(seq_len, num_segments * TILE_SIZE) + + # create masks for subsequent loads + act_num_segments = cdiv_fn(seq_len, tiles_per_segment * TILE_SIZE) + segm_mask = tl.arange(0, NUM_SEGMENTS_PER_SEQ) < tl.full( + [NUM_SEGMENTS_PER_SEQ], act_num_segments, dtype=tl.int32) + dim_mask = tl.where(tl.arange(0, HEAD_SIZE_PADDED) < HEAD_SIZE, 1, + 0).to(tl.int1) + + # load segment maxima + segm_offset = (seq_idx.to(tl.int64) * + (num_query_heads * NUM_SEGMENTS_PER_SEQ) + + query_head_idx * NUM_SEGMENTS_PER_SEQ + + tl.arange(0, NUM_SEGMENTS_PER_SEQ)) + segm_max = tl.load(segm_max_ptr + segm_offset, + mask=segm_mask, + other=float("-inf")) + overall_max = tl.max(segm_max) + + # load and rescale segment exp sums + segm_expsum = tl.load(segm_expsum_ptr + segm_offset, + mask=segm_mask, + other=0.0) + segm_expsum = segm_expsum * tl.exp(segm_max - overall_max) + overall_expsum = tl.sum(segm_expsum) + + # load, rescale, and add segment attention outputs + segm_output_offset = ( + seq_idx.to(tl.int64) * + (num_query_heads * NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + query_head_idx * (NUM_SEGMENTS_PER_SEQ * HEAD_SIZE_PADDED) + + 
tl.arange(0, NUM_SEGMENTS_PER_SEQ)[:, None] * HEAD_SIZE_PADDED + + tl.arange(0, HEAD_SIZE_PADDED)[None, :]) + segm_output = tl.load( + segm_output_ptr + segm_output_offset, + mask=segm_mask[:, None] & dim_mask[None, :], + other=0.0, + ) + segm_output *= tl.exp(segm_max - overall_max)[:, None] + acc_sum = tl.sum(segm_output, axis=0) + # safely divide by overall_expsum, returning 0.0 if overall_expsum is 0 + acc = tl.where(overall_expsum == 0.0, 0.0, acc_sum / overall_expsum) + + # write result + output_offset = (seq_idx * output_stride_0 + + query_head_idx * output_stride_1 + + tl.arange(0, HEAD_SIZE_PADDED)) + tl.store(output_ptr + output_offset, acc, mask=dim_mask) + + +def unified_attention( + q, + k, + v, + out, + cu_seqlens_q, + max_seqlen_q, + num_decodes, + seqused_k, + max_seqlen_k, + softmax_scale, + causal, + window_size, + block_table, + softcap, + q_descale, + k_descale, + v_descale, + use_split_kv, + segm_output, + segm_max, + segm_expsum, + BLOCK_M_PREFILL, + BLOCK_Q_PREFILL, + BLOCK_M_DECODE, + BLOCK_Q_DECODE, + num_q_blocks, + block_q_seq_boundaries, + alibi_slopes=None, + qq_bias=None, +): + assert causal, "Only causal attention is supported" + assert q_descale is None, "Q scales not supported" + + block_size = v.shape[1] + assert q.element_size() >= 2 or block_size >= 32, \ + "Block size must be at least 32 for fp8" + + use_alibi_slopes = alibi_slopes is not None + use_qq_bias = qq_bias is not None + + block_size = v.shape[1] + num_seqs = len(seqused_k) + num_query_heads = q.shape[1] + num_kv_heads = k.shape[2] + num_queries_per_kv = num_query_heads // num_kv_heads + head_size = q.shape[2] + + TILE_SIZE_PREFILL = 32 + TILE_SIZE_DECODE = 32 + + LAUNCH_GRID_DIM0_2D_PREFILL = 32 + LAUNCH_GRID_DIM0_2D_DECODE = 32 + LAUNCH_GRID_DIM0_3D_DECODE = 4 + LAUNCH_GRID_DIM0_3D_REDUCE = 4 + + # prefill + if num_seqs > num_decodes: + kernel_unified_attention_2d[( + LAUNCH_GRID_DIM0_2D_PREFILL, #num_q_blocks, + num_kv_heads, + )]( + output_ptr=out, + query_ptr=q, + key_cache_ptr=k, + value_cache_ptr=v, + block_tables_ptr=block_table, + seq_lens_ptr=seqused_k, + alibi_slopes_ptr=alibi_slopes, + qq_bias_ptr=qq_bias, + scale=softmax_scale, + k_scale=k_descale, + v_scale=v_descale, + softcap=softcap, + num_query_heads=num_query_heads, + num_queries_per_kv=num_queries_per_kv, + block_table_stride=block_table.stride(0), + query_stride_0=q.stride(0), + query_stride_1=q.stride(1), + output_stride_0=out.stride(0), + output_stride_1=out.stride(1), + qq_bias_stride_0=qq_bias.stride(0) if use_qq_bias else 0, + BLOCK_SIZE=block_size, + TILE_SIZE=TILE_SIZE_PREFILL, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + USE_ALIBI_SLOPES=use_alibi_slopes, + USE_QQ_BIAS=use_qq_bias, + USE_SOFTCAP=(softcap > 0), + SLIDING_WINDOW=(1 + window_size[0]), + stride_k_cache_0=k.stride(0), + stride_k_cache_1=k.stride(1), + stride_k_cache_2=k.stride(2), + stride_k_cache_3=k.stride(3), + stride_v_cache_0=v.stride(0), + stride_v_cache_1=v.stride(1), + stride_v_cache_2=v.stride(2), + stride_v_cache_3=v.stride(3), + query_start_len_ptr=cu_seqlens_q, + BLOCK_Q=BLOCK_Q_PREFILL, + num_seqs=num_seqs - num_decodes, + seq_idx_offset=num_decodes, + BLOCK_M=BLOCK_M_PREFILL, + block_q_seq_boundaries_ptr=block_q_seq_boundaries, + is_prefill=True, + max_q_block_idx=num_q_blocks-1, + q_block_iterations=(num_q_blocks + LAUNCH_GRID_DIM0_2D_PREFILL - 1) // LAUNCH_GRID_DIM0_2D_PREFILL + ) + + # decode + if num_decodes > 0: + # select between 2d and 3d (split-kv) kernels + if not use_split_kv: + 
kernel_unified_attention_2d[( + LAUNCH_GRID_DIM0_2D_DECODE, #num_decodes, + num_kv_heads, + )]( + output_ptr=out, + query_ptr=q, + key_cache_ptr=k, + value_cache_ptr=v, + block_tables_ptr=block_table, + seq_lens_ptr=seqused_k, + alibi_slopes_ptr=alibi_slopes, + qq_bias_ptr=qq_bias, + scale=softmax_scale, + k_scale=k_descale, + v_scale=v_descale, + softcap=softcap, + num_query_heads=num_query_heads, + num_queries_per_kv=num_queries_per_kv, + block_table_stride=block_table.stride(0), + query_stride_0=q.stride(0), + query_stride_1=q.stride(1), + output_stride_0=out.stride(0), + output_stride_1=out.stride(1), + qq_bias_stride_0=qq_bias.stride(0) if use_qq_bias else 0, + BLOCK_SIZE=block_size, + TILE_SIZE=TILE_SIZE_DECODE, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + USE_ALIBI_SLOPES=use_alibi_slopes, + USE_QQ_BIAS=use_qq_bias, + USE_SOFTCAP=(softcap > 0), + SLIDING_WINDOW=(1 + window_size[0]), + stride_k_cache_0=k.stride(0), + stride_k_cache_1=k.stride(1), + stride_k_cache_2=k.stride(2), + stride_k_cache_3=k.stride(3), + stride_v_cache_0=v.stride(0), + stride_v_cache_1=v.stride(1), + stride_v_cache_2=v.stride(2), + stride_v_cache_3=v.stride(3), + query_start_len_ptr=cu_seqlens_q, + BLOCK_Q=BLOCK_Q_DECODE, + num_seqs=num_decodes, + seq_idx_offset=0, + BLOCK_M=BLOCK_M_DECODE, + block_q_seq_boundaries_ptr=None, + is_prefill=False, + max_q_block_idx=num_decodes-1, + q_block_iterations=(num_decodes + LAUNCH_GRID_DIM0_2D_DECODE - 1) // LAUNCH_GRID_DIM0_2D_DECODE + ) + else: + # for initial version, NUM_SEGMENTS = 16 is chosen as a default + # value that showed good performance in tests + NUM_SEGMENTS = 16 + +# segm_output = torch.empty( +# num_decodes, +# num_query_heads, +# NUM_SEGMENTS, +# triton.next_power_of_2(head_size), +# dtype=torch.float32, +# device=q.device, +# ) +# segm_max = torch.empty( +# num_decodes, +# num_query_heads, +# NUM_SEGMENTS, +# dtype=torch.float32, +# device=q.device, +# ) +# segm_expsum = torch.empty( +# num_decodes, +# num_query_heads, +# NUM_SEGMENTS, +# dtype=torch.float32, +# device=q.device, +# ) + + kernel_unified_attention_3d[( + LAUNCH_GRID_DIM0_3D_DECODE, #num_decodes, + num_kv_heads, + NUM_SEGMENTS + )]( + segm_output_ptr=segm_output, + segm_max_ptr=segm_max, + segm_expsum_ptr=segm_expsum, + query_ptr=q, + key_cache_ptr=k, + value_cache_ptr=v, + block_tables_ptr=block_table, + seq_lens_ptr=seqused_k, + alibi_slopes_ptr=alibi_slopes, + qq_bias_ptr=qq_bias, + scale=softmax_scale, + k_scale=k_descale, + v_scale=v_descale, + softcap=softcap, + num_query_heads=num_query_heads, + num_queries_per_kv=num_queries_per_kv, + block_table_stride=block_table.stride(0), + query_stride_0=q.stride(0), + query_stride_1=q.stride(1), + qq_bias_stride_0=qq_bias.stride(0) if use_qq_bias else 0, + BLOCK_SIZE=block_size, + TILE_SIZE=TILE_SIZE_DECODE, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + USE_ALIBI_SLOPES=use_alibi_slopes, + USE_QQ_BIAS=use_qq_bias, + USE_SOFTCAP=(softcap > 0), + SLIDING_WINDOW=(1 + window_size[0]), + stride_k_cache_0=k.stride(0), + stride_k_cache_1=k.stride(1), + stride_k_cache_2=k.stride(2), + stride_k_cache_3=k.stride(3), + stride_v_cache_0=v.stride(0), + stride_v_cache_1=v.stride(1), + stride_v_cache_2=v.stride(2), + stride_v_cache_3=v.stride(3), + query_start_len_ptr=cu_seqlens_q, + BLOCK_Q=BLOCK_Q_DECODE, + num_seqs=num_decodes, + BLOCK_M=BLOCK_M_DECODE, + NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, + seq_idx_iterations=(num_decodes + LAUNCH_GRID_DIM0_3D_DECODE - 1) // LAUNCH_GRID_DIM0_3D_DECODE + 
) + reduce_segments[( + LAUNCH_GRID_DIM0_3D_REDUCE, #num_decodes, + num_query_heads + )]( + output_ptr=out, + segm_output_ptr=segm_output, + segm_max_ptr=segm_max, + segm_expsum_ptr=segm_expsum, + seq_lens_ptr=seqused_k, + num_seqs=num_seqs, + num_query_heads=num_query_heads, + output_stride_0=out.stride(0), + output_stride_1=out.stride(1), + block_table_stride=block_table.stride(0), + TILE_SIZE=TILE_SIZE_DECODE, + HEAD_SIZE=head_size, + HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), + query_start_len_ptr=cu_seqlens_q, + NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, + seq_idx_iterations=(num_decodes + LAUNCH_GRID_DIM0_3D_REDUCE - 1) // LAUNCH_GRID_DIM0_3D_REDUCE + ) diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 0f53a537b..6ff5cfab0 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -75,6 +75,8 @@ class Implementation(Enum): NT_UNF_TRITON_2D = 18 NT_UNF_TRITON_AUTO = 19 UNF_TRITON_2D_TUNED = 20 + GRID_TRITON_3D = 21 + GRID_TRITON_2D = 22 class BenchmarkMode(Enum): @@ -1059,9 +1061,15 @@ def test_prefix_vllm_v1_attention( Implementation.NT_UNF_TRITON_2D, # Implementation.NT_UNF_TRITON_AUTO, Implementation.UNF_TRITON_2D_TUNED, + Implementation.GRID_TRITON_2D, + Implementation.GRID_TRITON_3D, ]: pytest.skip() + if implementation == Implementation.GRID_TRITON_3D and decode_share != 1.0: + pytest.skip("not supported") + + if batch_composition == BatchComposition.ALTERNATING and implementation == Implementation.FLASH_ATTN: pytest.skip("not supported") @@ -1329,6 +1337,10 @@ def test_prefix_vllm_v1_attention( from callers import NewTilesUnifiedTritonAutoAttentionCaller as Caller elif implementation == Implementation.UNF_TRITON_2D_TUNED: from callers import TunedUnifiedTriton2dAttentionCaller as Caller + elif implementation == Implementation.GRID_TRITON_3D: + from callers import GridTriton3dAttentionCaller as Caller + elif implementation == Implementation.GRID_TRITON_2D: + from callers import GridTriton2dAttentionCaller as Caller if Caller.requires_allocated_output: output = torch.empty_like(query) diff --git a/scripts/callers/__init__.py b/scripts/callers/__init__.py index 41131a013..35ba469cf 100644 --- a/scripts/callers/__init__.py +++ b/scripts/callers/__init__.py @@ -64,3 +64,7 @@ NewTilesUnifiedTriton3dAttentionCaller, NewTilesUnifiedTritonAutoAttentionCaller, ) +from .grid_triton import ( + GridTriton2dAttentionCaller, + GridTriton3dAttentionCaller, +) diff --git a/scripts/callers/grid_triton.py b/scripts/callers/grid_triton.py new file mode 100644 index 000000000..8ae898202 --- /dev/null +++ b/scripts/callers/grid_triton.py @@ -0,0 +1,193 @@ +# /******************************************************************************* +# * Copyright 2025 IBM Corporation +# * +# * Licensed under the Apache License, Version 2.0 (the "License"); +# * you may not use this file except in compliance with the License. +# * You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. 
+# *******************************************************************************/ +# + +import torch +import triton + +from ibm_triton_lib.kernels import unified_attention_grid +from .base import PrefixPrefillCaller + + +class GridTriton3dAttentionCaller(PrefixPrefillCaller): + @staticmethod + def make_call_func( + output, + query, + key_cache, + value_cache, + key, + value, + block_tables, + seq_lens, + ctx_lens, + query_lens, + start_loc, + seq_start_loc, + softmax_scale, + # kv_cache_dtype, # unused + force_selection=3, + ): + """ + query: shape = [num_tokens, num_heads, head_size] + key: shape = [num_tokens, num_kv_heads, head_size] + value: shape = [num_tokens, num_kv_heads, head_size] + k_cache = [num_blocks, block_size, num_kv_heads, head_size] + v_cache = [num_blocks, block_size, num_kv_heads, head_size] + Returns: + shape = [num_tokens, num_heads, head_size] + """ + + max_query_len = query_lens.max() + max_seqlen = seq_lens.max() + + avg_seqlen_q = query_lens.to(torch.float).mean() + avg_seqlen_k = seq_lens.to(torch.float).mean() + + block_size = value.shape[1] + num_seqs = len(seq_lens) + num_query_heads = query.shape[1] + num_kv_heads = key.shape[2] + num_queries_per_kv = num_query_heads // num_kv_heads + head_size = query.shape[2] + + query_lens = torch.diff(start_loc) + if max_query_len == 1: + num_decodes = len(seq_lens) + else: + num_decodes = torch.argmax((query_lens != 1).int()).item() + + BLOCK_M_PREFILL = 64 + BLOCK_M_DECODE = 16 + BLOCK_Q_PREFILL = BLOCK_M_PREFILL * num_kv_heads // num_query_heads + BLOCK_Q_DECODE = BLOCK_M_DECODE * num_kv_heads // num_query_heads + + block_q_seq_boundaries = torch.cumsum(torch.cat([torch.tensor([0], dtype=query_lens.dtype, device=query_lens.device), torch.ceil(query_lens[num_decodes:] / BLOCK_Q_PREFILL).to(torch.int)]), dim=0) + num_q_blocks = block_q_seq_boundaries[-1].item() + + # use_split_kv = (num_q_blocks * self.num_heads_kv < 128) + use_split_kv = force_selection == 3 + + NUM_SEGMENTS=16 + + if use_split_kv: + segm_output = torch.empty( + num_decodes, + num_query_heads, + NUM_SEGMENTS, + triton.next_power_of_2(head_size), + dtype=torch.float32, + device=seq_lens.device, + ) + segm_max = torch.empty( + num_decodes, + num_query_heads, + NUM_SEGMENTS, + dtype=torch.float32, + device=seq_lens.device, + ) + segm_expsum = torch.empty( + num_decodes, + num_query_heads, + NUM_SEGMENTS, + dtype=torch.float32, + device=seq_lens.device, + ) + else: + segm_output = None + segm_max = None + segm_expsum = None + + if use_split_kv: + assert num_decodes == num_seqs, "3d can only do decodes" + + def call_and_process_output(): + # k must have shape (num_blocks, page_block_size, num_heads_k, head_size) + return unified_attention_grid( + q=query, + k=key_cache, + v=value_cache, + out=output, + cu_seqlens_q=start_loc, + max_seqlen_q=max_query_len, + seqused_k=seq_lens, + max_seqlen_k=max_seqlen, + softmax_scale=softmax_scale, + causal=True, + window_size=(-1, -1), + block_table=block_tables, + softcap=0, + q_descale=None, + k_descale=None, # TODO? + v_descale=None, # TODO? 
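+                # k_descale/v_descale stay None (see TODOs above); this caller
+                # presumably only benchmarks non-fp8 KV caches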
+ alibi_slopes=None, + use_split_kv=use_split_kv, + num_decodes=num_decodes, + segm_output=segm_output, + segm_max=segm_max, + segm_expsum=segm_expsum, + BLOCK_M_PREFILL=BLOCK_M_PREFILL, + BLOCK_Q_PREFILL=BLOCK_Q_PREFILL, + BLOCK_M_DECODE=BLOCK_M_DECODE, + BLOCK_Q_DECODE=BLOCK_Q_DECODE, + num_q_blocks=num_q_blocks, + block_q_seq_boundaries=block_q_seq_boundaries + ) + + return call_and_process_output + + @staticmethod + def requires_allocated_output() -> bool: + return True + + +class GridTriton2dAttentionCaller(GridTriton3dAttentionCaller): + @staticmethod + def make_call_func( + output, + query, + key_cache, + value_cache, + key, + value, + block_tables, + seq_lens, + ctx_lens, + query_lens, + start_loc, + seq_start_loc, + softmax_scale, + # kv_cache_dtype, # unused + force_selection=2, + ): + + return GridTriton3dAttentionCaller.make_call_func( + output, + query, + key_cache, + value_cache, + key, + value, + block_tables, + seq_lens, + ctx_lens, + query_lens, + start_loc, + seq_start_loc, + softmax_scale, + force_selection=2, + ) + diff --git a/scripts/setups/prefix_grid.conf b/scripts/setups/prefix_grid.conf new file mode 100644 index 000000000..62b3c05f3 --- /dev/null +++ b/scripts/setups/prefix_grid.conf @@ -0,0 +1,35 @@ +BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64, 128] +# BATCH_SIZES = [4] +# order: num_query_heads, num_kv_heads +NUM_HEADS = [[32, 8]] + +SEQUENCE_LENGTHS = [16, 32, 64, 128, 512, 1024, 2048, 4096] +# SEQUENCE_LENGTHS = [64] +PREFIX_PREFILL_SHARE_OF_DECODE = [0.0, 0.5, 1.0] +# PREFIX_PREFILL_SHARE_OF_PARTIAL_PREFILL = [0.0, 0.5] +PREFIX_PREFILL_SHARE_OF_PARTIAL_PREFILL = [0.0] +# PREFIX_PREFILL_BATCH_COMPOSITION = ["DEC_PRE"] + +RESERVE_INPUT_TOKEN_LENGTH = ["none", 132096] +# RESERVE_INPUT_TOKEN_LENGTH = [132096] +# RESERVE_INPUT_TOKEN_LENGTH = ["none"] + +HEAD_SIZES = [128] # only powers of 2! for llama2 & 3 +BLOCK_SIZES = [16] +NUM_BLOCKS = [4321] # "arbitrary values for testing..." 
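+# this setup targets the new GRID_TRITON kernels in CUDA_GRAPHS mode
+# (see BENCHMARK_MODES and IMPLEMENTATION_UT below)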
+ +PROMPT_PATTERNS = [[1.0], [0.1, 0.4, 0.5, 1.0, 0.2]] +# PROMPT_PATTERNS = [[1.0]] + +MAX_VALUES = [1.0] +# BENCHMARK_MODES = ["CUDA_EVENTS"] +BENCHMARK_MODES = ["CUDA_GRAPHS"] + +# IMPLEMENTATION_UT = ["UNF_TRITON_2D_TUNED", "UNF_TRITON_2D_SIMPLE"] +IMPLEMENTATION_UT = ["GRID_TRITON_2D", "GRID_TRITON_3D"] + +# TRITON_BACKEND_DEBUG = 1 +# STORE_TEST_RESULT_PATH=/results +STORE_TEST_RESULT_PATH=./zrl-triton-results-and-notebooks/micro_benchmarks/raw_data/ + +TEST_ALLOW_INCORRECT = 1 diff --git a/triton-dejavu b/triton-dejavu index 3ec45ef42..fe9cc48f2 160000 --- a/triton-dejavu +++ b/triton-dejavu @@ -1 +1 @@ -Subproject commit 3ec45ef425aca5bbcd026e2d32f9da6b4981f3c4 +Subproject commit fe9cc48f2b75310a21c722dbe9cb31ab985f85a7 From cce415a2e16ab1554e1b51d58ff7fdc8a400ac1d Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Thu, 14 Aug 2025 09:27:41 -0400 Subject: [PATCH 42/61] further ws tuning for simple kernel; add tuning for new grid kernels Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 128 +++++++++++++- .../default/cache.json | 8 + .../kernels/triton_unified_attention_tuned.py | 2 - .../kernels/triton_unified_grid.py | 164 ++++++++++++++++-- scripts/benchmark.py | 2 +- scripts/setups/prefix_grid.conf | 6 +- scripts/setups/tune_2d_ws.conf | 9 +- 7 files changed, 290 insertions(+), 29 deletions(-) create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f52792779faa0af779cada63f2df14c185a5b34f253646e36c07bb8926f93dc8/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-88d41f86261407aa0eaf355d2d650ddaee68bdf62e28c6cc74f4e1bcacddcfd8/default/cache.json diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json index 84d454b13..472c55180 100755 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json +++ 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json @@ -1,6 +1,6 @@ { "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention_tuned:kernel_unified_attention_2d)", - "total_bench_time_s": 21743.59187436104, + "total_bench_time_s": 32995.41111779213, "evaluated_configs": 2160, "keys": [ "MAX_SEQ_Q", @@ -74,7 +74,38 @@ "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 3, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 3, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, 
reg_inc_consumer: 0, maxnreg: None", + "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '8', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '16', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '32', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '64', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '128', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '256', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, 
reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '512', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '1024', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '2048', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1', '4096', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" }, "timings": { "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ @@ -250,6 +281,99 @@ ], "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ 0.0063226004131138325 + ], + "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0069314902648329735 + ], + "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.007872514426708221 + ], + "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.01249010395258665 + ], + "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.08108722418546677 + ], + "('512', '1024', '512', 
'1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.2769642770290375 + ], + "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.986293613910675 + ], + "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 3.6365156173706055 + ], + "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0069512976333498955 + ], + "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.007947840727865696 + ], + "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.012514323927462101 + ], + "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.08159603923559189 + ], + "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.2810220718383789 + ], + "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.9966282248497009 + ], + "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 3.6692380905151367 + ], + "('1', '8', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0068373410031199455 + ], + "('1', '16', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.006867218296974897 + ], + "('1', '32', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.0068841795437037945 + ], + "('1', '64', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.007741984911262989 + ], + "('1', '128', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.01235784962773323 + ], + "('1', '256', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.02117188833653927 + ], + "('1', '512', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.03320121765136719 + ], + "('1', '1024', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.05449502542614937 + ], + "('1', '2048', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.09907654672861099 + ], + "('1', '4096', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.19813136756420135 + ], + "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.008454970084130764 + ], + "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.014529259875416756 + ], + "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.026538236066699028 + ], + "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.18360291421413422 + ], + "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 0.5871036052703857 + ], + "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 2.0788326263427734 + ], + "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ + 7.741743564605713 ] }, "timings_data": { diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f52792779faa0af779cada63f2df14c185a5b34f253646e36c07bb8926f93dc8/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-88d41f86261407aa0eaf355d2d650ddaee68bdf62e28c6cc74f4e1bcacddcfd8/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f52792779faa0af779cada63f2df14c185a5b34f253646e36c07bb8926f93dc8/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-88d41f86261407aa0eaf355d2d650ddaee68bdf62e28c6cc74f4e1bcacddcfd8/default/cache.json new file mode 100755 index 000000000..a4569e066 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f52792779faa0af779cada63f2df14c185a5b34f253646e36c07bb8926f93dc8/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-88d41f86261407aa0eaf355d2d650ddaee68bdf62e28c6cc74f4e1bcacddcfd8/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention_tuned:kernel_unified_attention_2d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_tuned.py b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_tuned.py index 04b4944e2..896114a7d 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_tuned.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_attention_tuned.py @@ -239,8 +239,6 @@ def prefill_heuristics_2d(MAX_SEQ_Q, MAX_SEQ_K, AVG_SEQ_Q, AVG_SEQ_K): num_stages=[1, 2, 4, 6, 8], # num_consumer_groups=[0, 2, 4], # num_buffers_warp_spec=[0, 3, 6], - # num_consumer_groups=[2], - # num_buffers_warp_spec=[3], num_consumer_groups=[2, 4], num_buffers_warp_spec=[3, 6], conditions=[ diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py index c8b2489dd..675c4a1ab 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py @@ -12,6 +12,9 @@ from vllm.logger import init_logger from vllm.triton_utils import tl, triton +import triton_dejavu +import os + logger = init_logger(__name__) @@ -42,6 +45,50 @@ def find_seq_idx(boundary_ptr, target_idx, num_seqs): return left - 1 +@triton_dejavu.autotune( + config_space=triton_dejavu.ConfigSpace( + { + "BLOCK_M": [16, 32, 64, 128, 256, 512], + "TILE_SIZE": [16, 32, 64, 128, 256, 512], + }, + num_warps=[2, 4, 8], + num_stages=[1, 2, 4, 6, 8], + # num_consumer_groups=[0, 2, 4], + # num_buffers_warp_spec=[0, 3, 6], + # num_consumer_groups=[2, 4], + # num_buffers_warp_spec=[3, 6], + conditions=[ + # ensure consistency for ws + lambda c: (c.num_consumer_groups !=0 and c.num_buffers_warp_spec != 0) \ + or (c.num_consumer_groups == 0 and c.num_buffers_warp_spec == 0), + ] + ), + # this list is longer, since it would be used for multiple models + key=[ + "num_query_heads", + "num_queries_per_kv", + "BLOCK_SIZE", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "SLIDING_WINDOW", + "stride_k_cache_3", + "stride_v_cache_3", + "is_prefill", + ], + custom_data_storage=os.path.abspath( + os.path.join(os.path.dirname(__file__), "dejavu_data") + ), + use_cuda_graph=True, + use_bo=True, + search_max_search_t=360, + # informed_fallback=informed_fallback_next, + # 
prepare_informed_fallback=prepare_informed_fallback, + # fallback_heuristic=fallback_heuristic_dt2, + ignore_dtypes=True, +) +@triton.heuristics( + {"BLOCK_Q": lambda args: args['BLOCK_M'] // args['num_queries_per_kv']}, +) @triton.jit def kernel_unified_attention_2d( output_ptr, # [num_tokens, num_query_heads, head_size] @@ -65,7 +112,6 @@ def kernel_unified_attention_2d( output_stride_1: tl.int64, # int, should be equal to head_size qq_bias_stride_0: tl.int64, # int BLOCK_SIZE: tl.constexpr, # int - TILE_SIZE: tl.constexpr, # int must be power of 2 HEAD_SIZE: tl.constexpr, # int HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 USE_ALIBI_SLOPES: tl.constexpr, # bool @@ -81,14 +127,15 @@ def kernel_unified_attention_2d( stride_v_cache_2: tl.int64, # int stride_v_cache_3: tl.constexpr, # int query_start_len_ptr, # [num_seqs+1] - BLOCK_Q: tl.constexpr, # int num_seqs: tl.int32, seq_idx_offset, # int - BLOCK_M: tl.constexpr, # int block_q_seq_boundaries_ptr, # [num_prefills] or None is_prefill: tl.constexpr, max_q_block_idx: tl.int32, # int q_block_iterations: tl.int32, # int + TILE_SIZE: tl.constexpr, # int must be power of 2 + BLOCK_Q: tl.constexpr, # int + BLOCK_M: tl.constexpr, # int ): if tl.program_id(0) * q_block_iterations > max_q_block_idx: return @@ -291,6 +338,50 @@ def kernel_unified_attention_2d( ) +@triton_dejavu.autotune( + config_space=triton_dejavu.ConfigSpace( + { + "BLOCK_M": [16, 32, 64, 128, 256, 512], + "TILE_SIZE": [16, 32, 64, 128, 256, 512], + }, + num_warps=[2, 4, 8], + num_stages=[1, 2, 4, 6, 8], + # num_consumer_groups=[0, 2, 4], + # num_buffers_warp_spec=[0, 3, 6], + # num_consumer_groups=[2, 4], + # num_buffers_warp_spec=[3, 6], + conditions=[ + # ensure consistency for ws + lambda c: (c.num_consumer_groups !=0 and c.num_buffers_warp_spec != 0) \ + or (c.num_consumer_groups == 0 and c.num_buffers_warp_spec == 0), + ] + ), + # this list is longer, since it would be used for multiple models + key=[ + "num_query_heads", + "num_queries_per_kv", + "BLOCK_SIZE", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "SLIDING_WINDOW", + "stride_k_cache_3", + "stride_v_cache_3", + "NUM_SEGMENTS_PER_SEQ", + ], + custom_data_storage=os.path.abspath( + os.path.join(os.path.dirname(__file__), "dejavu_data") + ), + use_cuda_graph=True, + use_bo=True, + search_max_search_t=360, + # informed_fallback=informed_fallback_next, + # prepare_informed_fallback=prepare_informed_fallback, + # fallback_heuristic=fallback_heuristic_dt2, + ignore_dtypes=True, +) +@triton.heuristics( + {"BLOCK_Q": lambda args: args['BLOCK_M'] // args['num_queries_per_kv']}, +) @triton.jit def kernel_unified_attention_3d( segm_output_ptr, @@ -315,7 +406,6 @@ def kernel_unified_attention_3d( query_stride_1: tl.int64, # int, should be equal to head_size qq_bias_stride_0: tl.int64, # int BLOCK_SIZE: tl.constexpr, # int - TILE_SIZE: tl.constexpr, # int, must be power of 2 HEAD_SIZE: tl.constexpr, # int HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 USE_ALIBI_SLOPES: tl.constexpr, # bool @@ -331,11 +421,12 @@ def kernel_unified_attention_3d( stride_v_cache_2: tl.int64, # int stride_v_cache_3: tl.constexpr, # int query_start_len_ptr, # [num_seqs+1] - BLOCK_Q: tl.constexpr, # int num_seqs: tl.int32, - BLOCK_M: tl.constexpr, # int NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int seq_idx_iterations: tl.int32, # int + BLOCK_Q: tl.constexpr, # int + TILE_SIZE: tl.constexpr, # int, must be power of 2 + BLOCK_M: tl.constexpr, # int ): if tl.program_id(0) * seq_idx_iterations >= num_seqs: return @@ -522,6 +613,41 @@ def 
kernel_unified_attention_3d( mask=query_mask_0 & query_mask_1) +@triton_dejavu.autotune( + config_space=triton_dejavu.ConfigSpace( + { + "TILE_SIZE": [16, 32, 64, 128, 256, 512], + }, + num_warps=[2, 4, 8], + num_stages=[1, 2, 4, 6, 8], + # num_consumer_groups=[0, 2, 4], + # num_buffers_warp_spec=[0, 3, 6], + # num_consumer_groups=[2, 4], + # num_buffers_warp_spec=[3, 6], + conditions=[ + # ensure consistency for ws + lambda c: (c.num_consumer_groups !=0 and c.num_buffers_warp_spec != 0) \ + or (c.num_consumer_groups == 0 and c.num_buffers_warp_spec == 0), + ] + ), + # this list is longer, since it would be used for multiple models + key=[ + "num_query_heads", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "NUM_SEGMENTS_PER_SEQ", + ], + custom_data_storage=os.path.abspath( + os.path.join(os.path.dirname(__file__), "dejavu_data") + ), + use_cuda_graph=True, + use_bo=True, + search_max_search_t=360, + # informed_fallback=informed_fallback_next, + # prepare_informed_fallback=prepare_informed_fallback, + # fallback_heuristic=fallback_heuristic_dt2, + ignore_dtypes=True, +) @triton.jit def reduce_segments( output_ptr, # [num_tokens, num_query_heads, head_size] @@ -535,12 +661,12 @@ def reduce_segments( output_stride_0: tl.int64, # int output_stride_1: tl.int64, # int, should be equal to head_size block_table_stride: tl.int64, # int - TILE_SIZE: tl.constexpr, # int HEAD_SIZE: tl.constexpr, # int, must be power of 2 HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 query_start_len_ptr, # [num_seqs+1] NUM_SEGMENTS_PER_SEQ: tl.constexpr, # int seq_idx_iterations: tl.int32, # int + TILE_SIZE: tl.constexpr, # int ): if tl.program_id(0) * seq_idx_iterations >= num_seqs: return @@ -686,7 +812,6 @@ def unified_attention( output_stride_1=out.stride(1), qq_bias_stride_0=qq_bias.stride(0) if use_qq_bias else 0, BLOCK_SIZE=block_size, - TILE_SIZE=TILE_SIZE_PREFILL, HEAD_SIZE=head_size, HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), USE_ALIBI_SLOPES=use_alibi_slopes, @@ -702,14 +827,16 @@ def unified_attention( stride_v_cache_2=v.stride(2), stride_v_cache_3=v.stride(3), query_start_len_ptr=cu_seqlens_q, - BLOCK_Q=BLOCK_Q_PREFILL, num_seqs=num_seqs - num_decodes, seq_idx_offset=num_decodes, - BLOCK_M=BLOCK_M_PREFILL, block_q_seq_boundaries_ptr=block_q_seq_boundaries, is_prefill=True, max_q_block_idx=num_q_blocks-1, q_block_iterations=(num_q_blocks + LAUNCH_GRID_DIM0_2D_PREFILL - 1) // LAUNCH_GRID_DIM0_2D_PREFILL + # tunable parameters + # BLOCK_M=BLOCK_M_PREFILL, + # BLOCK_Q=BLOCK_Q_PREFILL, + # TILE_SIZE=TILE_SIZE_PREFILL, ) # decode @@ -741,7 +868,6 @@ def unified_attention( output_stride_1=out.stride(1), qq_bias_stride_0=qq_bias.stride(0) if use_qq_bias else 0, BLOCK_SIZE=block_size, - TILE_SIZE=TILE_SIZE_DECODE, HEAD_SIZE=head_size, HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), USE_ALIBI_SLOPES=use_alibi_slopes, @@ -757,14 +883,16 @@ def unified_attention( stride_v_cache_2=v.stride(2), stride_v_cache_3=v.stride(3), query_start_len_ptr=cu_seqlens_q, - BLOCK_Q=BLOCK_Q_DECODE, num_seqs=num_decodes, seq_idx_offset=0, - BLOCK_M=BLOCK_M_DECODE, block_q_seq_boundaries_ptr=None, is_prefill=False, max_q_block_idx=num_decodes-1, q_block_iterations=(num_decodes + LAUNCH_GRID_DIM0_2D_DECODE - 1) // LAUNCH_GRID_DIM0_2D_DECODE + # tunable parameters + # BLOCK_M=BLOCK_M_DECODE, + # BLOCK_Q=BLOCK_Q_DECODE, + # TILE_SIZE=TILE_SIZE_DECODE, ) else: # for initial version, NUM_SEGMENTS = 16 is chosen as a default @@ -820,7 +948,6 @@ def unified_attention( query_stride_1=q.stride(1), 
qq_bias_stride_0=qq_bias.stride(0) if use_qq_bias else 0, BLOCK_SIZE=block_size, - TILE_SIZE=TILE_SIZE_DECODE, HEAD_SIZE=head_size, HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), USE_ALIBI_SLOPES=use_alibi_slopes, @@ -836,11 +963,13 @@ def unified_attention( stride_v_cache_2=v.stride(2), stride_v_cache_3=v.stride(3), query_start_len_ptr=cu_seqlens_q, - BLOCK_Q=BLOCK_Q_DECODE, num_seqs=num_decodes, - BLOCK_M=BLOCK_M_DECODE, NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, seq_idx_iterations=(num_decodes + LAUNCH_GRID_DIM0_3D_DECODE - 1) // LAUNCH_GRID_DIM0_3D_DECODE + # tunable parameters + # BLOCK_Q=BLOCK_Q_DECODE, + # BLOCK_M=BLOCK_M_DECODE, + # TILE_SIZE=TILE_SIZE_DECODE, ) reduce_segments[( LAUNCH_GRID_DIM0_3D_REDUCE, #num_decodes, @@ -856,10 +985,11 @@ def unified_attention( output_stride_0=out.stride(0), output_stride_1=out.stride(1), block_table_stride=block_table.stride(0), - TILE_SIZE=TILE_SIZE_DECODE, HEAD_SIZE=head_size, HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), query_start_len_ptr=cu_seqlens_q, NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, seq_idx_iterations=(num_decodes + LAUNCH_GRID_DIM0_3D_REDUCE - 1) // LAUNCH_GRID_DIM0_3D_REDUCE + # tunable parameters + # TILE_SIZE=TILE_SIZE_DECODE, ) diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 6ff5cfab0..6d7881855 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -106,7 +106,7 @@ class BatchComposition(Enum): SEQUENCE_LENGTHS = [16, 32, 64, 128, 512, 1024, 2048, 4096] PREFIX_PREFILL_SHARE_OF_DECODE = [0.0, 0.5, 1.0] PREFIX_PREFILL_SHARE_OF_PARTIAL_PREFILL = [0.0, 0.5] -PREFIX_PREFILL_BATCH_COMPOSITION = [BatchComposition.ALTERNATING] +PREFIX_PREFILL_BATCH_COMPOSITION = [BatchComposition.DEC_PRE] RESERVE_INPUT_TOKEN_LENGTH = [None] HEAD_SIZES = [128] # only powers of 2! for llama2 & 3 diff --git a/scripts/setups/prefix_grid.conf b/scripts/setups/prefix_grid.conf index 62b3c05f3..bba5f879b 100644 --- a/scripts/setups/prefix_grid.conf +++ b/scripts/setups/prefix_grid.conf @@ -8,11 +8,11 @@ SEQUENCE_LENGTHS = [16, 32, 64, 128, 512, 1024, 2048, 4096] PREFIX_PREFILL_SHARE_OF_DECODE = [0.0, 0.5, 1.0] # PREFIX_PREFILL_SHARE_OF_PARTIAL_PREFILL = [0.0, 0.5] PREFIX_PREFILL_SHARE_OF_PARTIAL_PREFILL = [0.0] -# PREFIX_PREFILL_BATCH_COMPOSITION = ["DEC_PRE"] +PREFIX_PREFILL_BATCH_COMPOSITION = ["DEC_PRE"] -RESERVE_INPUT_TOKEN_LENGTH = ["none", 132096] +# RESERVE_INPUT_TOKEN_LENGTH = ["none", 132096] # RESERVE_INPUT_TOKEN_LENGTH = [132096] -# RESERVE_INPUT_TOKEN_LENGTH = ["none"] +RESERVE_INPUT_TOKEN_LENGTH = ["none"] HEAD_SIZES = [128] # only powers of 2! for llama2 & 3 BLOCK_SIZES = [16] diff --git a/scripts/setups/tune_2d_ws.conf b/scripts/setups/tune_2d_ws.conf index 5eb1304b5..f2d435a6a 100644 --- a/scripts/setups/tune_2d_ws.conf +++ b/scripts/setups/tune_2d_ws.conf @@ -5,10 +5,11 @@ NUM_HEADS = [[32, 8]] SEQUENCE_LENGTHS = [16, 32, 64, 128, 512, 1024, 2048, 4096] # SEQUENCE_LENGTHS = [64] -PREFIX_PREFILL_SHARE_OF_DECODE = [0.0, 0.5] -# PREFIX_PREFILL_SHARE_OF_PARTIAL_PREFILL = [0.0, 0.5] -PREFIX_PREFILL_SHARE_OF_PARTIAL_PREFILL = [0.0] -# PREFIX_PREFILL_BATCH_COMPOSITION = ["DEC_PRE"] +# PREFIX_PREFILL_SHARE_OF_DECODE = [0.0, 0.5] +PREFIX_PREFILL_SHARE_OF_DECODE = [0.0, 0.5, 1.0] +PREFIX_PREFILL_SHARE_OF_PARTIAL_PREFILL = [0.0, 0.5] +# PREFIX_PREFILL_SHARE_OF_PARTIAL_PREFILL = [0.0] +PREFIX_PREFILL_BATCH_COMPOSITION = ["DEC_PRE"] HEAD_SIZES = [128] # only powers of 2! 
for llama2 & 3 BLOCK_SIZES = [16] From e221205c9247209c1caa6ddd2e67af72bddbd66f Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Sat, 16 Aug 2025 07:05:41 -0400 Subject: [PATCH 43/61] tuning grid w/o ws Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 36 +++++++++++++++++++ .../default/cache.json | 32 +++++++++++++++++ .../default/cache.json | 27 ++++++++++++++ triton-dejavu | 2 +- 4 files changed, 96 insertions(+), 1 deletion(-) create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-52c92ceef6d420c78c5c5940c8b38fe551467bdabe0ca1810415fbe039359610/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json new file mode 100755 index 000000000..47793d9a0 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json @@ -0,0 +1,36 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", + 
"total_bench_time_s": 828.1587612628937, + "evaluated_configs": 540, + "keys": [ + "num_query_heads", + "num_queries_per_kv", + "BLOCK_SIZE", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "SLIDING_WINDOW", + "stride_k_cache_3", + "stride_v_cache_3", + "is_prefill" + ], + "cache": { + "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": "BLOCK_M: 16, TILE_SIZE: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": "BLOCK_M: 16, TILE_SIZE: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": [ + 0.0039040199480950832 + ], + "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": [ + 0.0035902990493923426 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json new file mode 100755 index 000000000..81ab50506 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json @@ -0,0 +1,32 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", + "total_bench_time_s": 360.36944031715393, + "evaluated_configs": 540, + "keys": [ + "num_query_heads", + "num_queries_per_kv", + "BLOCK_SIZE", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "SLIDING_WINDOW", + "stride_k_cache_3", + "stride_v_cache_3", + "NUM_SEGMENTS_PER_SEQ" + ], + "cache": { + "('32', '4', '16', '128', '128', '0', '1', '1', '16')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('32', '4', '16', '128', '128', '0', '1', '1', '16')": [ + 0.0035186302848160267 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } +} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-52c92ceef6d420c78c5c5940c8b38fe551467bdabe0ca1810415fbe039359610/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-52c92ceef6d420c78c5c5940c8b38fe551467bdabe0ca1810415fbe039359610/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json new file mode 100755 index 000000000..acb692e9e --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-52c92ceef6d420c78c5c5940c8b38fe551467bdabe0ca1810415fbe039359610/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json @@ -0,0 +1,27 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", + "total_bench_time_s": 102.50655031204224, + "evaluated_configs": 90, + "keys": [ + "num_query_heads", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "NUM_SEGMENTS_PER_SEQ" + ], + "cache": { + "('32', '128', '128', '16')": "TILE_SIZE: 16, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('32', '128', '128', '16')": [ + 0.0022160690277814865 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } +} \ No newline at end of file diff --git a/triton-dejavu b/triton-dejavu index fe9cc48f2..0a1c1abcd 160000 --- a/triton-dejavu +++ b/triton-dejavu @@ -1 +1 @@ -Subproject commit fe9cc48f2b75310a21c722dbe9cb31ab985f85a7 +Subproject commit 0a1c1abcdd5b36faefbad20e608b448afa869269 From 19a73a8721aeb26b740125feae4ef43385a8cceb Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Sat, 16 Aug 2025 07:30:49 -0400 Subject: [PATCH 44/61] grid tuning with ws, preparation Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ .../ibm_triton_lib/kernels/triton_unified_grid.py | 12 ++++++------ 4 files changed, 30 insertions(+), 6 deletions(-) create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json create mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-f130aa2e7a5258b0e95f6494e2db37f5dea3ccbb97ee8feed09d2d36599bff88/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-071e784de56797ed9764ebe722a0ebf6c8c9719610c15e34a8b3a8f9fe7252ae/default/cache.json diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json new file mode 100755 index 000000000..5b55f921d --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json new file mode 100755 index 000000000..d3eb13852 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-f130aa2e7a5258b0e95f6494e2db37f5dea3ccbb97ee8feed09d2d36599bff88/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-071e784de56797ed9764ebe722a0ebf6c8c9719610c15e34a8b3a8f9fe7252ae/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-f130aa2e7a5258b0e95f6494e2db37f5dea3ccbb97ee8feed09d2d36599bff88/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-071e784de56797ed9764ebe722a0ebf6c8c9719610c15e34a8b3a8f9fe7252ae/default/cache.json new file mode 100755 index 000000000..e7d868df2 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-f130aa2e7a5258b0e95f6494e2db37f5dea3ccbb97ee8feed09d2d36599bff88/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-071e784de56797ed9764ebe722a0ebf6c8c9719610c15e34a8b3a8f9fe7252ae/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py index 675c4a1ab..5c8e82d78 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py @@ -55,8 +55,8 @@ def find_seq_idx(boundary_ptr, target_idx, num_seqs): num_stages=[1, 2, 4, 6, 8], # num_consumer_groups=[0, 2, 4], # num_buffers_warp_spec=[0, 3, 6], - # num_consumer_groups=[2, 4], - # num_buffers_warp_spec=[3, 6], + num_consumer_groups=[2, 
4], + num_buffers_warp_spec=[3, 6], conditions=[ # ensure consistency for ws lambda c: (c.num_consumer_groups !=0 and c.num_buffers_warp_spec != 0) \ @@ -348,8 +348,8 @@ def kernel_unified_attention_2d( num_stages=[1, 2, 4, 6, 8], # num_consumer_groups=[0, 2, 4], # num_buffers_warp_spec=[0, 3, 6], - # num_consumer_groups=[2, 4], - # num_buffers_warp_spec=[3, 6], + num_consumer_groups=[2, 4], + num_buffers_warp_spec=[3, 6], conditions=[ # ensure consistency for ws lambda c: (c.num_consumer_groups !=0 and c.num_buffers_warp_spec != 0) \ @@ -622,8 +622,8 @@ def kernel_unified_attention_3d( num_stages=[1, 2, 4, 6, 8], # num_consumer_groups=[0, 2, 4], # num_buffers_warp_spec=[0, 3, 6], - # num_consumer_groups=[2, 4], - # num_buffers_warp_spec=[3, 6], + num_consumer_groups=[2, 4], + num_buffers_warp_spec=[3, 6], conditions=[ # ensure consistency for ws lambda c: (c.num_consumer_groups !=0 and c.num_buffers_warp_spec != 0) \ From 16d64d83b9703055bdbf3c66475da509564254b9 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Sat, 16 Aug 2025 08:23:45 -0400 Subject: [PATCH 45/61] tuning forcing ws (partially failing) Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 38 ++++++++++++++++--- .../default/cache.json | 34 ++++++++++++++--- 2 files changed, 62 insertions(+), 10 deletions(-) diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json index 5b55f921d..870c8b475 100755 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json @@ -1,8 +1,36 @@ { "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} + "total_bench_time_s": 863.3593587875366, + 
"evaluated_configs": 2160, + "keys": [ + "num_query_heads", + "num_queries_per_kv", + "BLOCK_SIZE", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "SLIDING_WINDOW", + "stride_k_cache_3", + "stride_v_cache_3", + "is_prefill" + ], + "cache": { + "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": "BLOCK_M: 16, TILE_SIZE: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": "BLOCK_M: 16, TILE_SIZE: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": [ + 0.007799518760293722 + ], + "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": [ + 0.006862994749099016 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } } \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json index d3eb13852..12932629d 100755 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json @@ -1,8 +1,32 @@ { "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} + "total_bench_time_s": 368.8641257286072, + "evaluated_configs": 2160, + "keys": [ + "num_query_heads", + "num_queries_per_kv", + "BLOCK_SIZE", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "SLIDING_WINDOW", + "stride_k_cache_3", + "stride_v_cache_3", + "NUM_SEGMENTS_PER_SEQ" + ], + "cache": { + "('32', '4', '16', 
'128', '128', '0', '1', '1', '16')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('32', '4', '16', '128', '128', '0', '1', '1', '16')": [ + 0.003861392615363002 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } } \ No newline at end of file From 2057fa4fe2c7a60dc69a4fa432eca7219e66e3c1 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Sat, 16 Aug 2025 08:34:16 -0400 Subject: [PATCH 46/61] allowing ws or not Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 8 +++++++ .../default/cache.json | 8 +++++++ .../default/cache.json | 8 +++++++ .../kernels/triton_unified_grid.py | 24 +++++++++---------- 4 files changed, 36 insertions(+), 12 deletions(-) create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3ec72a24614e22e4f8984d4b3b95b35928fcaf36a5101e03f51287f47aa54959/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json new file mode 100755 index 000000000..5b55f921d --- /dev/null +++ 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json new file mode 100755 index 000000000..d3eb13852 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3ec72a24614e22e4f8984d4b3b95b35928fcaf36a5101e03f51287f47aa54959/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3ec72a24614e22e4f8984d4b3b95b35928fcaf36a5101e03f51287f47aa54959/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json new file mode 100755 index 000000000..e7d868df2 --- /dev/null +++ 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3ec72a24614e22e4f8984d4b3b95b35928fcaf36a5101e03f51287f47aa54959/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py index 5c8e82d78..07e64942b 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py @@ -53,10 +53,10 @@ def find_seq_idx(boundary_ptr, target_idx, num_seqs): }, num_warps=[2, 4, 8], num_stages=[1, 2, 4, 6, 8], - # num_consumer_groups=[0, 2, 4], - # num_buffers_warp_spec=[0, 3, 6], - num_consumer_groups=[2, 4], - num_buffers_warp_spec=[3, 6], + num_consumer_groups=[0, 2, 4, 8], + num_buffers_warp_spec=[0, 3, 6, 9], + # num_consumer_groups=[2, 4], + # num_buffers_warp_spec=[3, 6], conditions=[ # ensure consistency for ws lambda c: (c.num_consumer_groups !=0 and c.num_buffers_warp_spec != 0) \ @@ -346,10 +346,10 @@ def kernel_unified_attention_2d( }, num_warps=[2, 4, 8], num_stages=[1, 2, 4, 6, 8], - # num_consumer_groups=[0, 2, 4], - # num_buffers_warp_spec=[0, 3, 6], - num_consumer_groups=[2, 4], - num_buffers_warp_spec=[3, 6], + num_consumer_groups=[0, 2, 4, 8], + num_buffers_warp_spec=[0, 3, 6, 9], + # num_consumer_groups=[2, 4], + # num_buffers_warp_spec=[3, 6], conditions=[ # ensure consistency for ws lambda c: (c.num_consumer_groups !=0 and c.num_buffers_warp_spec != 0) \ @@ -620,10 +620,10 @@ def kernel_unified_attention_3d( }, num_warps=[2, 4, 8], num_stages=[1, 2, 4, 6, 8], - # num_consumer_groups=[0, 2, 4], - # num_buffers_warp_spec=[0, 3, 6], - num_consumer_groups=[2, 4], - num_buffers_warp_spec=[3, 6], + num_consumer_groups=[0, 2, 4, 8], + num_buffers_warp_spec=[0, 3, 6, 9], + # num_consumer_groups=[2, 4], + # num_buffers_warp_spec=[3, 6], conditions=[ # ensure consistency for ws lambda c: (c.num_consumer_groups !=0 and c.num_buffers_warp_spec != 0) \ From ffe0346edd97c2a6a29889e10fdeef9421795804 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Sat, 16 Aug 2025 11:14:58 -0400 Subject: [PATCH 47/61] tuning with wrong ws config Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 38 ++++++++++++++++--- .../default/cache.json | 34 ++++++++++++++--- .../default/cache.json | 29 +++++++++++--- .../kernels/triton_unified_grid.py | 6 +-- triton-dejavu | 2 +- 5 files changed, 90 insertions(+), 19 deletions(-) diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json index 5b55f921d..87360ce3e 100755 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json @@ -1,8 +1,36 @@ { "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} + "total_bench_time_s": 859.6228244304657, + "evaluated_configs": 5400, + "keys": [ + "num_query_heads", + "num_queries_per_kv", + "BLOCK_SIZE", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "SLIDING_WINDOW", + "stride_k_cache_3", + "stride_v_cache_3", + "is_prefill" + ], + "cache": { + "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": "BLOCK_M: 32, TILE_SIZE: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 9, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": "BLOCK_M: 32, TILE_SIZE: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 9, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": [ + 0.007184021640568972 + ], + "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": [ + 0.006555985659360886 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } } \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json index d3eb13852..02018ed3d 100755 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json @@ -1,8 +1,32 @@ { "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} + "total_bench_time_s": 362.4042990207672, + "evaluated_configs": 5400, + "keys": [ + "num_query_heads", + "num_queries_per_kv", + "BLOCK_SIZE", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "SLIDING_WINDOW", + "stride_k_cache_3", + "stride_v_cache_3", + "NUM_SEGMENTS_PER_SEQ" + ], + "cache": { + "('32', '4', '16', '128', '128', '0', '1', '1', '16')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 9, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('32', '4', '16', '128', '128', '0', '1', '1', '16')": [ + 0.0031293570064008236 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } } \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3ec72a24614e22e4f8984d4b3b95b35928fcaf36a5101e03f51287f47aa54959/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3ec72a24614e22e4f8984d4b3b95b35928fcaf36a5101e03f51287f47aa54959/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json index e7d868df2..e30476d4b 100755 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3ec72a24614e22e4f8984d4b3b95b35928fcaf36a5101e03f51287f47aa54959/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3ec72a24614e22e4f8984d4b3b95b35928fcaf36a5101e03f51287f47aa54959/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json @@ -1,8 +1,27 @@ { "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} + "total_bench_time_s": 360.16377663612366, + "evaluated_configs": 900, + "keys": [ + "num_query_heads", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "NUM_SEGMENTS_PER_SEQ" + ], + "cache": { + "('32', '128', '128', '16')": "TILE_SIZE: 32, num_warps: 8, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('32', '128', '128', '16')": [ + 0.0031249839812517166 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } } \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py index 07e64942b..80e1a391e 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py @@ -59,7 +59,7 @@ def find_seq_idx(boundary_ptr, target_idx, num_seqs): # num_buffers_warp_spec=[3, 6], conditions=[ # ensure consistency for ws - lambda c: (c.num_consumer_groups !=0 and c.num_buffers_warp_spec != 0) \ + lambda c: (c.num_consumer_groups != 0 and c.num_buffers_warp_spec != 0) \ or (c.num_consumer_groups == 0 and c.num_buffers_warp_spec == 0), ] ), @@ -352,7 +352,7 @@ def kernel_unified_attention_2d( # num_buffers_warp_spec=[3, 6], conditions=[ # ensure consistency for ws - lambda c: (c.num_consumer_groups !=0 and c.num_buffers_warp_spec != 0) \ + lambda c: (c.num_consumer_groups != 0 and c.num_buffers_warp_spec != 0) \ or (c.num_consumer_groups == 0 and c.num_buffers_warp_spec == 0), ] ), @@ -626,7 +626,7 @@ def kernel_unified_attention_3d( # num_buffers_warp_spec=[3, 6], conditions=[ # ensure consistency for ws - lambda c: (c.num_consumer_groups !=0 and c.num_buffers_warp_spec != 0) \ + lambda c: (c.num_consumer_groups != 0 and c.num_buffers_warp_spec != 0) \ or (c.num_consumer_groups == 0 and c.num_buffers_warp_spec == 0), ] ), diff --git a/triton-dejavu b/triton-dejavu index 0a1c1abcd..3f3e9a194 160000 --- a/triton-dejavu +++ b/triton-dejavu @@ -1 +1 @@ -Subproject commit 0a1c1abcdd5b36faefbad20e608b448afa869269 +Subproject commit 3f3e9a1940545f8d01476b464d9e8bea73972d79 From d7576512510104f14211f61f55ea26ae1650f41b Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Sat, 16 Aug 2025 12:59:32 -0400 Subject: [PATCH 48/61] autotuning with the right 
bohb spaces Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 36 +++++++++++++++++++ .../default/cache.json | 32 +++++++++++++++++ .../default/cache.json | 27 ++++++++++++++ .../kernels/triton_unified_grid.py | 9 +++-- triton-dejavu | 2 +- 5 files changed, 102 insertions(+), 4 deletions(-) create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3a6fc1c46225b2f7d0bc848adf5344e3dda28dcbb0957584ee22138ce6625218/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json new file mode 100755 index 000000000..17a69de08 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json @@ -0,0 +1,36 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", + "total_bench_time_s": 1721.1768200397491, + "evaluated_configs": 5400, + "keys": [ + "num_query_heads", + "num_queries_per_kv", + "BLOCK_SIZE", + 
"HEAD_SIZE", + "HEAD_SIZE_PADDED", + "SLIDING_WINDOW", + "stride_k_cache_3", + "stride_v_cache_3", + "is_prefill" + ], + "cache": { + "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": "BLOCK_M: 16, TILE_SIZE: 32, num_warps: 8, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": "BLOCK_M: 16, TILE_SIZE: 32, num_warps: 8, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": [ + 0.004668071866035461 + ], + "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": [ + 0.0035326406359672546 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json new file mode 100755 index 000000000..6b8ebea6e --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json @@ -0,0 +1,32 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", + "total_bench_time_s": 720.5651552677155, + "evaluated_configs": 5400, + "keys": [ + "num_query_heads", + "num_queries_per_kv", + "BLOCK_SIZE", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "SLIDING_WINDOW", + "stride_k_cache_3", + "stride_v_cache_3", + "NUM_SEGMENTS_PER_SEQ" + ], + "cache": { + "('32', '4', '16', '128', '128', '0', '1', '1', '16')": "BLOCK_M: 32, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('32', '4', '16', '128', '128', '0', '1', '1', '16')": [ + 0.003578872187063098 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } +} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3a6fc1c46225b2f7d0bc848adf5344e3dda28dcbb0957584ee22138ce6625218/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3a6fc1c46225b2f7d0bc848adf5344e3dda28dcbb0957584ee22138ce6625218/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json new file mode 100755 index 000000000..d53f63026 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3a6fc1c46225b2f7d0bc848adf5344e3dda28dcbb0957584ee22138ce6625218/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json @@ -0,0 +1,27 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", + "total_bench_time_s": 367.19957637786865, + "evaluated_configs": 900, + "keys": [ + "num_query_heads", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "NUM_SEGMENTS_PER_SEQ" + ], + "cache": { + "('32', '128', '128', '16')": "TILE_SIZE: 32, num_warps: 8, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('32', '128', '128', '16')": [ + 0.0031237052753567696 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py index 80e1a391e..40896aa85 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py @@ -80,7 +80,8 @@ def find_seq_idx(boundary_ptr, target_idx, num_seqs): ), use_cuda_graph=True, use_bo=True, - search_max_search_t=360, + # search_max_search_t=360, + search_max_search_t=720, # informed_fallback=informed_fallback_next, # prepare_informed_fallback=prepare_informed_fallback, # fallback_heuristic=fallback_heuristic_dt2, @@ -373,7 +374,8 @@ def kernel_unified_attention_2d( ), use_cuda_graph=True, use_bo=True, - search_max_search_t=360, + # search_max_search_t=360, + search_max_search_t=720, # informed_fallback=informed_fallback_next, # prepare_informed_fallback=prepare_informed_fallback, # fallback_heuristic=fallback_heuristic_dt2, @@ -642,7 +644,8 @@ def kernel_unified_attention_3d( ), use_cuda_graph=True, use_bo=True, - search_max_search_t=360, + # search_max_search_t=360, + search_max_search_t=720, # informed_fallback=informed_fallback_next, # prepare_informed_fallback=prepare_informed_fallback, # fallback_heuristic=fallback_heuristic_dt2, diff --git a/triton-dejavu b/triton-dejavu index 3f3e9a194..18250db44 160000 --- a/triton-dejavu 
+++ b/triton-dejavu @@ -1 +1 @@ -Subproject commit 3f3e9a1940545f8d01476b464d9e8bea73972d79 +Subproject commit 18250db440141d6f3b136cb396e8061c0d9baad1 From ebc7a41506bc2d84683b1106d3f5245038eba2d8 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Sat, 16 Aug 2025 13:51:13 -0400 Subject: [PATCH 49/61] preparing to run without bo Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ .../ibm_triton_lib/kernels/triton_unified_grid.py | 12 ++++++------ 4 files changed, 30 insertions(+), 6 deletions(-) create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json new file mode 100755 index 000000000..5b55f921d --- /dev/null +++ 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json new file mode 100755 index 000000000..d3eb13852 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json new file mode 100755 index 000000000..e7d868df2 --- /dev/null +++ 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py index 40896aa85..2103e5a4f 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py @@ -79,9 +79,9 @@ def find_seq_idx(boundary_ptr, target_idx, num_seqs): os.path.join(os.path.dirname(__file__), "dejavu_data") ), use_cuda_graph=True, - use_bo=True, + # use_bo=True, # search_max_search_t=360, - search_max_search_t=720, + # search_max_search_t=720, # informed_fallback=informed_fallback_next, # prepare_informed_fallback=prepare_informed_fallback, # fallback_heuristic=fallback_heuristic_dt2, @@ -373,9 +373,9 @@ def kernel_unified_attention_2d( os.path.join(os.path.dirname(__file__), "dejavu_data") ), use_cuda_graph=True, - use_bo=True, + # use_bo=True, # search_max_search_t=360, - search_max_search_t=720, + # search_max_search_t=720, # informed_fallback=informed_fallback_next, # prepare_informed_fallback=prepare_informed_fallback, # fallback_heuristic=fallback_heuristic_dt2, @@ -643,9 +643,9 @@ def kernel_unified_attention_3d( os.path.join(os.path.dirname(__file__), "dejavu_data") ), use_cuda_graph=True, - use_bo=True, + # use_bo=True, # search_max_search_t=360, - search_max_search_t=720, + # search_max_search_t=720, # informed_fallback=informed_fallback_next, # prepare_informed_fallback=prepare_informed_fallback, # fallback_heuristic=fallback_heuristic_dt2, From 41120d7afe68cd2ea231ad4f36cf4055544e2078 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Sun, 17 Aug 2025 13:10:06 -0400 Subject: [PATCH 50/61] switching to random search Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ .../ibm_triton_lib/kernels/triton_unified_grid.py | 6 ++++++ triton-dejavu | 2 +- 5 files changed, 31 insertions(+), 1 deletion(-) create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json create mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-5c087adac96d09b2060f573486a99205cda08f58e544b9acfd14918832e2e582/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json new file mode 100755 index 000000000..5b55f921d --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json new file mode 100755 index 000000000..d3eb13852 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-5c087adac96d09b2060f573486a99205cda08f58e544b9acfd14918832e2e582/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-5c087adac96d09b2060f573486a99205cda08f58e544b9acfd14918832e2e582/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json new file mode 100755 index 000000000..e7d868df2 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-5c087adac96d09b2060f573486a99205cda08f58e544b9acfd14918832e2e582/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py index 2103e5a4f..31dd911c4 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py @@ -82,6 +82,8 @@ def find_seq_idx(boundary_ptr, target_idx, num_seqs): # use_bo=True, # search_max_search_t=360, # search_max_search_t=720, + use_random_search=True, + search_max_search_t=1800, # informed_fallback=informed_fallback_next, # 
prepare_informed_fallback=prepare_informed_fallback, # fallback_heuristic=fallback_heuristic_dt2, @@ -376,6 +378,8 @@ def kernel_unified_attention_2d( # use_bo=True, # search_max_search_t=360, # search_max_search_t=720, + use_random_search=True, + search_max_search_t=1800, # informed_fallback=informed_fallback_next, # prepare_informed_fallback=prepare_informed_fallback, # fallback_heuristic=fallback_heuristic_dt2, @@ -646,6 +650,8 @@ def kernel_unified_attention_3d( # use_bo=True, # search_max_search_t=360, # search_max_search_t=720, + use_random_search=True, + search_max_search_t=1800, # informed_fallback=informed_fallback_next, # prepare_informed_fallback=prepare_informed_fallback, # fallback_heuristic=fallback_heuristic_dt2, diff --git a/triton-dejavu b/triton-dejavu index 18250db44..27247f94b 160000 --- a/triton-dejavu +++ b/triton-dejavu @@ -1 +1 @@ -Subproject commit 18250db440141d6f3b136cb396e8061c0d9baad1 +Subproject commit 27247f94b10aecc95799e439cb96ff6c99097a32 From adc8ccf045c0a5c01834423fd8004526a0ec5465 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Mon, 18 Aug 2025 07:23:27 -0400 Subject: [PATCH 51/61] switching back to bo Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ .../ibm_triton_lib/kernels/triton_unified_grid.py | 12 ++++++------ 4 files changed, 30 insertions(+), 6 deletions(-) create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-72dc6d55a572ac899f3da4b41257cc6aeb8cad69a0fc94b16aa73ca9c82b4012/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json new file mode 100755 index 000000000..5b55f921d --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json new file mode 100755 index 000000000..d3eb13852 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-72dc6d55a572ac899f3da4b41257cc6aeb8cad69a0fc94b16aa73ca9c82b4012/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-72dc6d55a572ac899f3da4b41257cc6aeb8cad69a0fc94b16aa73ca9c82b4012/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json new file mode 100755 index 000000000..e7d868df2 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-72dc6d55a572ac899f3da4b41257cc6aeb8cad69a0fc94b16aa73ca9c82b4012/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py index 31dd911c4..dded6c710 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py @@ -79,10 +79,10 @@ def find_seq_idx(boundary_ptr, target_idx, num_seqs): os.path.join(os.path.dirname(__file__), "dejavu_data") ), use_cuda_graph=True, - # use_bo=True, + use_bo=True, # search_max_search_t=360, # search_max_search_t=720, - use_random_search=True, + # use_random_search=True, search_max_search_t=1800, # informed_fallback=informed_fallback_next, # prepare_informed_fallback=prepare_informed_fallback, @@ -375,10 +375,10 @@ def kernel_unified_attention_2d( os.path.join(os.path.dirname(__file__), "dejavu_data") ), use_cuda_graph=True, - # use_bo=True, + use_bo=True, # search_max_search_t=360, # search_max_search_t=720, - use_random_search=True, + # use_random_search=True, search_max_search_t=1800, # informed_fallback=informed_fallback_next, # prepare_informed_fallback=prepare_informed_fallback, @@ -647,10 +647,10 @@ def kernel_unified_attention_3d( os.path.join(os.path.dirname(__file__), "dejavu_data") ), use_cuda_graph=True, - # use_bo=True, + use_bo=True, # search_max_search_t=360, # search_max_search_t=720, - use_random_search=True, + # use_random_search=True, search_max_search_t=1800, # informed_fallback=informed_fallback_next, # prepare_informed_fallback=prepare_informed_fallback, From 7eb1410b5d4bbceed5c5da45cfeaf1cc8bcae3e4 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Mon, 18 Aug 2025 12:55:24 -0400 Subject: [PATCH 52/61] another run without ws Signed-off-by: Burkhard Ringlein --- .../ibm_triton_lib/kernels/__init__.py | 2 +- .../default/cache.json | 8 ++++ .../default/cache.json | 8 ++++ .../default/cache.json | 8 ++++ .../kernels/triton_unified_grid.py | 46 +++++++++---------- vllm | 2 +- 6 files changed, 49 insertions(+), 25 deletions(-) create mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py b/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py index 8332a82b5..722e0507b 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/__init__.py @@ -74,4 +74,4 @@ def ConfigSpace( from .mamba_ssm import selective_state_update -from .fused_moe import fused_moe +# from .fused_moe import fused_moe diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json new file mode 100755 index 000000000..5b55f921d --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + 
"keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json new file mode 100755 index 000000000..d3eb13852 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json new file mode 100755 index 000000000..e7d868df2 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file 
diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py index dded6c710..936bf5a5a 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_grid.py @@ -53,15 +53,15 @@ def find_seq_idx(boundary_ptr, target_idx, num_seqs): }, num_warps=[2, 4, 8], num_stages=[1, 2, 4, 6, 8], - num_consumer_groups=[0, 2, 4, 8], - num_buffers_warp_spec=[0, 3, 6, 9], + # num_consumer_groups=[0, 2, 4, 8], + # num_buffers_warp_spec=[0, 3, 6, 9], # num_consumer_groups=[2, 4], # num_buffers_warp_spec=[3, 6], - conditions=[ - # ensure consistency for ws - lambda c: (c.num_consumer_groups != 0 and c.num_buffers_warp_spec != 0) \ - or (c.num_consumer_groups == 0 and c.num_buffers_warp_spec == 0), - ] + # conditions=[ + # # ensure consistency for ws + # lambda c: (c.num_consumer_groups != 0 and c.num_buffers_warp_spec != 0) \ + # or (c.num_consumer_groups == 0 and c.num_buffers_warp_spec == 0), + # ] ), # this list is longer, since it would be used for multiple models key=[ @@ -349,15 +349,15 @@ def kernel_unified_attention_2d( }, num_warps=[2, 4, 8], num_stages=[1, 2, 4, 6, 8], - num_consumer_groups=[0, 2, 4, 8], - num_buffers_warp_spec=[0, 3, 6, 9], + # num_consumer_groups=[0, 2, 4, 8], + # num_buffers_warp_spec=[0, 3, 6, 9], # num_consumer_groups=[2, 4], # num_buffers_warp_spec=[3, 6], - conditions=[ - # ensure consistency for ws - lambda c: (c.num_consumer_groups != 0 and c.num_buffers_warp_spec != 0) \ - or (c.num_consumer_groups == 0 and c.num_buffers_warp_spec == 0), - ] + # conditions=[ + # # ensure consistency for ws + # lambda c: (c.num_consumer_groups != 0 and c.num_buffers_warp_spec != 0) \ + # or (c.num_consumer_groups == 0 and c.num_buffers_warp_spec == 0), + # ] ), # this list is longer, since it would be used for multiple models key=[ @@ -626,15 +626,15 @@ def kernel_unified_attention_3d( }, num_warps=[2, 4, 8], num_stages=[1, 2, 4, 6, 8], - num_consumer_groups=[0, 2, 4, 8], - num_buffers_warp_spec=[0, 3, 6, 9], - # num_consumer_groups=[2, 4], - # num_buffers_warp_spec=[3, 6], - conditions=[ - # ensure consistency for ws - lambda c: (c.num_consumer_groups != 0 and c.num_buffers_warp_spec != 0) \ - or (c.num_consumer_groups == 0 and c.num_buffers_warp_spec == 0), - ] + # num_consumer_groups=[0, 2, 4, 8], + # num_buffers_warp_spec=[0, 3, 6, 9], + # # num_consumer_groups=[2, 4], + # # num_buffers_warp_spec=[3, 6], + # conditions=[ + # # ensure consistency for ws + # lambda c: (c.num_consumer_groups != 0 and c.num_buffers_warp_spec != 0) \ + # or (c.num_consumer_groups == 0 and c.num_buffers_warp_spec == 0), + # ] ), # this list is longer, since it would be used for multiple models key=[ diff --git a/vllm b/vllm index d7cc6ee33..838864288 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit d7cc6ee330d93b0398f3ead75ab779d8a7a1042f +Subproject commit 8388642880bb6fc2f581c839ff8ac216acdeb380 From 64d6d33ca7c16dd9eb23fb32d3c43f260cfb0b38 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Mon, 18 Aug 2025 20:29:03 +0000 Subject: [PATCH 53/61] preparing tuning on MI300 Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ .../default/cache.json | 
8 ++++++++ .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ 12 files changed, 96 insertions(+) create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_6.3.1/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-48e3cb6cd6592d4b55826bce9ff39781f5f8d3beec28e171da3dd4e5109ad732/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json create mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_6.3.1/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_6.3.1/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json new file mode 100755 index 000000000..2540ac5c3 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_6.3.1/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json new file mode 100755 index 000000000..c2b3452bf --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.mamba_ssm:_selective_scan_update_kernel)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json new file mode 100755 index 000000000..c2b3452bf --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.mamba_ssm:_selective_scan_update_kernel)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json new file mode 100755 index 000000000..2540ac5c3 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json new file mode 100755 index 000000000..2540ac5c3 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-48e3cb6cd6592d4b55826bce9ff39781f5f8d3beec28e171da3dd4e5109ad732/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-48e3cb6cd6592d4b55826bce9ff39781f5f8d3beec28e171da3dd4e5109ad732/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json new file mode 100755 index 000000000..a7c2af725 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-48e3cb6cd6592d4b55826bce9ff39781f5f8d3beec28e171da3dd4e5109ad732/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json new file mode 100755 index 000000000..a4569e066 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention_tuned:kernel_unified_attention_2d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json new file mode 100755 index 000000000..5b55f921d --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", + "total_bench_time_s": 0.0, + 
"evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json new file mode 100755 index 000000000..a7c2af725 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json new file mode 100755 index 000000000..a7c2af725 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": 
"JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json new file mode 100755 index 000000000..d3eb13852 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json new file mode 100755 index 000000000..e7d868df2 --- /dev/null +++ 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file From c68dd28de601f026a5c5824dfd511d704de91575 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Tue, 19 Aug 2025 02:39:34 -0400 Subject: [PATCH 54/61] tuning 30min on H100 Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 38 ++++++++++++++++--- .../default/cache.json | 34 ++++++++++++++--- .../default/cache.json | 29 +++++++++++--- triton-dejavu | 2 +- vllm | 2 +- 5 files changed, 88 insertions(+), 17 deletions(-) diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json index 5b55f921d..710e7b803 100755 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json @@ -1,8 +1,36 @@ { "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} + "total_bench_time_s": 2846.828315258026, + "evaluated_configs": 540, + "keys": [ + "num_query_heads", + "num_queries_per_kv", + "BLOCK_SIZE", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "SLIDING_WINDOW", + 
"stride_k_cache_3", + "stride_v_cache_3", + "is_prefill" + ], + "cache": { + "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": [ + 0.003479903331026435 + ], + "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": [ + 0.003208082402125001 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } } \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json index d3eb13852..368c26881 100755 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json @@ -1,8 +1,32 @@ { "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} + "total_bench_time_s": 1181.724599123001, + "evaluated_configs": 540, + "keys": [ + "num_query_heads", + "num_queries_per_kv", + "BLOCK_SIZE", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "SLIDING_WINDOW", + "stride_k_cache_3", + "stride_v_cache_3", + "NUM_SEGMENTS_PER_SEQ" + ], + "cache": { + "('32', '4', '16', '128', '128', '0', '1', '1', '16')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, 
reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('32', '4', '16', '128', '128', '0', '1', '1', '16')": [ + 0.0031476265285164118 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } } \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json index e7d868df2..4230de538 100755 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json @@ -1,8 +1,27 @@ { "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} + "total_bench_time_s": 77.74497675895691, + "evaluated_configs": 90, + "keys": [ + "num_query_heads", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "NUM_SEGMENTS_PER_SEQ" + ], + "cache": { + "('32', '128', '128', '16')": "TILE_SIZE: 32, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('32', '128', '128', '16')": [ + 0.002219553105533123 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } } \ No newline at end of file diff --git a/triton-dejavu b/triton-dejavu index 27247f94b..5c7d4fa99 160000 --- a/triton-dejavu +++ b/triton-dejavu @@ -1 +1 @@ -Subproject commit 27247f94b10aecc95799e439cb96ff6c99097a32 +Subproject commit 5c7d4fa9915134d1ce12c4e244015ee705cd5df3 diff --git a/vllm b/vllm index 838864288..8b1c0ff03 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit 8388642880bb6fc2f581c839ff8ac216acdeb380 +Subproject commit 8b1c0ff03130ba7b22f15e61b0bf68da0f30a3ca From 429ddd504b386587944b580c746a214932e8b583 Mon 
Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Tue, 19 Aug 2025 06:41:33 +0000 Subject: [PATCH 55/61] tuning 30min MI300 Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 36 +++++++++++++++++++ .../default/cache.json | 32 +++++++++++++++++ .../default/cache.json | 27 ++++++++++++++ 3 files changed, 95 insertions(+) create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json new file mode 100755 index 000000000..de8c75698 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json @@ -0,0 +1,36 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", + 
"total_bench_time_s": 3830.64182972908, + "evaluated_configs": 540, + "keys": [ + "num_query_heads", + "num_queries_per_kv", + "BLOCK_SIZE", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "SLIDING_WINDOW", + "stride_k_cache_3", + "stride_v_cache_3", + "is_prefill" + ], + "cache": { + "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": "BLOCK_M: 32, TILE_SIZE: 16, num_warps: 8, num_ctas: 1, num_stages: 6, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", + "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": "BLOCK_M: 32, TILE_SIZE: 16, num_warps: 8, num_ctas: 1, num_stages: 6, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": [ + 0.00517149455845356 + ], + "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": [ + 0.00435659708455205 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json new file mode 100755 index 000000000..901033d5b --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json @@ -0,0 +1,32 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", + "total_bench_time_s": 1805.8680896759033, + "evaluated_configs": 540, + "keys": [ + "num_query_heads", + "num_queries_per_kv", + "BLOCK_SIZE", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "SLIDING_WINDOW", + "stride_k_cache_3", + "stride_v_cache_3", + "NUM_SEGMENTS_PER_SEQ" + ], + "cache": { + "('32', '4', '16', '128', '128', '0', '1', '1', '16')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('32', '4', '16', '128', '128', '0', '1', '1', '16')": [ + 0.003383171046152711 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } +} \ No newline at 
end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json new file mode 100755 index 000000000..165560713 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json @@ -0,0 +1,27 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", + "total_bench_time_s": 115.25903606414795, + "evaluated_configs": 90, + "keys": [ + "num_query_heads", + "HEAD_SIZE", + "HEAD_SIZE_PADDED", + "NUM_SEGMENTS_PER_SEQ" + ], + "cache": { + "('32', '128', '128', '16')": "TILE_SIZE: 128, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" + }, + "timings": { + "('32', '128', '128', '16')": [ + 0.0028324048034846783 + ] + }, + "timings_data": { + "labels": [ + "ms" + ], + "rep_t_ms": 100, + "warmup_t_ms": 25, + "cuda_graphs": true + } +} \ No newline at end of file From 40f0dfbd8cf3d835cdccc91a7872a29ccae514c8 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Wed, 20 Aug 2025 08:49:07 -0400 Subject: [PATCH 56/61] baseline experiments Signed-off-by: Burkhard Ringlein --- .../triton_chunked_prefill_paged_decode.py | 5 +- .../triton_paged_decode_attention_2d.py | 63 ++++++++++--------- scripts/setups/prefix_grid.conf | 7 ++- 3 files changed, 39 insertions(+), 36 deletions(-) diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/legacy/triton_chunked_prefill_paged_decode.py b/ibm-triton-lib/ibm_triton_lib/kernels/legacy/triton_chunked_prefill_paged_decode.py index f09b6ba9f..beb911396 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/legacy/triton_chunked_prefill_paged_decode.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/legacy/triton_chunked_prefill_paged_decode.py @@ -102,16 +102,17 @@ def chunked_prefill_paged_decode( HEAD_SIZE_PADDED=next_power_of_2(head_size), USE_ALIBI_SLOPES=use_alibi_slopes, SLIDING_WINDOW=sliding_window_int, - x=key_cache.shape[4], + x=key_cache.shape[4] if len(key_cache.shape) == 5 else 1, stride_k_cache_0=key_cache.stride(0), stride_k_cache_1=key_cache.stride(1), stride_k_cache_2=key_cache.stride(2), stride_k_cache_3=key_cache.stride(3), - stride_k_cache_4=key_cache.stride(4), + 
stride_k_cache_4=key_cache.stride(4) if len(key_cache.shape) == 5 else 1, stride_v_cache_0=value_cache.stride(0), stride_v_cache_1=value_cache.stride(1), stride_v_cache_2=value_cache.stride(2), stride_v_cache_3=value_cache.stride(3), filter_by_query_len=True, query_start_len_ptr=query_start_loc, + # num_seqs=num_seqs, ) diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/legacy/triton_paged_decode_attention_2d.py b/ibm-triton-lib/ibm_triton_lib/kernels/legacy/triton_paged_decode_attention_2d.py index eb03b53c2..410a3ddeb 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/legacy/triton_paged_decode_attention_2d.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/legacy/triton_paged_decode_attention_2d.py @@ -71,31 +71,32 @@ def cdiv_fn(x, y): return (x + y - 1) // y -@triton_dejavu.jitcache( - # remove cache_lock if dynamic cache mode should be used - cache_lock=global_cache_lock, - # list of `tl.constexpr` that should be used as cache index - check_keys=["USE_ALIBI_SLOPES", "SLIDING_WINDOW", "filter_by_query_len"], - check_specialization=["num_seqs", "stride_k_cache_3", "stride_v_cache_3"], - assume_const=[ - "scale", - "k_scale", - "v_scale", - "query_stride_1", - "output_stride_1", - "stride_k_cache_0", - "stride_k_cache_1", - "stride_k_cache_2", - "stride_k_cache_4", - "stride_v_cache_0", - "stride_v_cache_1", - "stride_v_cache_2", - ], - # besides this checks and assumed constants, - # the cache just binds all non_const_expr - cache_launch_grid=True, -) -@triton.jit(launch_metadata=metadata_fn) +# @triton_dejavu.jitcache( +# # remove cache_lock if dynamic cache mode should be used +# cache_lock=global_cache_lock, +# # list of `tl.constexpr` that should be used as cache index +# check_keys=["USE_ALIBI_SLOPES", "SLIDING_WINDOW", "filter_by_query_len"], +# check_specialization=["num_seqs", "stride_k_cache_3", "stride_v_cache_3"], +# assume_const=[ +# "scale", +# "k_scale", +# "v_scale", +# "query_stride_1", +# "output_stride_1", +# "stride_k_cache_0", +# "stride_k_cache_1", +# "stride_k_cache_2", +# "stride_k_cache_4", +# "stride_v_cache_0", +# "stride_v_cache_1", +# "stride_v_cache_2", +# ], +# # besides this checks and assumed constants, +# # the cache just binds all non_const_expr +# cache_launch_grid=True, +# ) +# @triton.jit(launch_metadata=metadata_fn) +@triton.jit def kernel_paged_attention_2d( # TODO: as soon as fixed in triton: add tl.pointer_type annotation output_ptr, #: tl.pointer_type, # [num_tokens, num_query_heads, head_size] @@ -133,11 +134,11 @@ def kernel_paged_attention_2d( stride_v_cache_3: tl.int64, # int filter_by_query_len: tl.constexpr, # bool query_start_len_ptr, #: tl.pointer_type, # [num_seqs+1] - num_seqs: int, + # num_seqs: int, ): seq_idx = tl.program_id(0) - if seq_idx >= num_seqs: - return + # if seq_idx >= num_seqs: + # return kv_head_idx = tl.program_id(1) if filter_by_query_len: @@ -352,10 +353,10 @@ def paged_attention_triton_2d( num_queries_per_kv_padded = max(triton.next_power_of_2(num_queries_per_kv), 16) - assert num_seqs <= 4096 + # assert num_seqs <= 4096 kernel_paged_attention_2d[ ( - 4096, + num_seqs, num_kv_heads, ) ]( @@ -394,5 +395,5 @@ def paged_attention_triton_2d( stride_v_cache_3=value_cache.stride(3), filter_by_query_len=False, query_start_len_ptr=None, - num_seqs=num_seqs, + # num_seqs=num_seqs, ) diff --git a/scripts/setups/prefix_grid.conf b/scripts/setups/prefix_grid.conf index bba5f879b..a299a91ed 100644 --- a/scripts/setups/prefix_grid.conf +++ b/scripts/setups/prefix_grid.conf @@ -22,11 +22,12 @@ PROMPT_PATTERNS = [[1.0], [0.1, 0.4, 
0.5, 1.0, 0.2]] # PROMPT_PATTERNS = [[1.0]] MAX_VALUES = [1.0] -# BENCHMARK_MODES = ["CUDA_EVENTS"] -BENCHMARK_MODES = ["CUDA_GRAPHS"] +BENCHMARK_MODES = ["CUDA_EVENTS"] +# BENCHMARK_MODES = ["CUDA_GRAPHS"] # IMPLEMENTATION_UT = ["UNF_TRITON_2D_TUNED", "UNF_TRITON_2D_SIMPLE"] -IMPLEMENTATION_UT = ["GRID_TRITON_2D", "GRID_TRITON_3D"] +# IMPLEMENTATION_UT = ["GRID_TRITON_2D", "GRID_TRITON_3D"] +IMPLEMENTATION_UT = ["TRITON_2D", "FLASH_ATTN"] # TRITON_BACKEND_DEBUG = 1 # STORE_TEST_RESULT_PATH=/results From 45056f5169d0fdbb89d4673b824886f99cc174ea Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Mon, 1 Sep 2025 10:02:50 -0400 Subject: [PATCH 57/61] last paper experiments Signed-off-by: Burkhard Ringlein --- .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ .../default/cache.json | 8 ++++++++ .../ibm_triton_lib/kernels/triton_unified_newtiles.py | 7 ++++--- scripts/callers/flash_attn.py | 2 +- scripts/setups/prefix_grid.conf | 3 ++- 8 files changed, 47 insertions(+), 5 deletions(-) create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json create mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json 
diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json new file mode 100755 index 000000000..c2b3452bf --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.mamba_ssm:_selective_scan_update_kernel)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json new file mode 100755 index 000000000..5b55f921d --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json new file mode 100755 index 000000000..a4569e066 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention_tuned:kernel_unified_attention_2d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json new file mode 100755 index 000000000..d3eb13852 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json new file mode 100755 index 000000000..e7d868df2 --- /dev/null +++ b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json @@ -0,0 +1,8 @@ +{ + "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", + "total_bench_time_s": 0.0, + "evaluated_configs": 0, + "keys": null, + "cache": {}, + "timings": {} +} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_newtiles.py b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_newtiles.py index 7c81b4796..c10e38b08 100644 --- a/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_newtiles.py +++ b/ibm-triton-lib/ibm_triton_lib/kernels/triton_unified_newtiles.py @@ -665,7 +665,6 @@ def unified_attention( output_stride_0=out.stride(0), output_stride_1=out.stride(1), BLOCK_SIZE=block_size, - TILE_SIZE=TILE_SIZE_PREFILL, HEAD_SIZE=head_size, HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), USE_ALIBI_SLOPES=use_alibi_slopes, @@ -680,9 +679,11 @@ def unified_attention( stride_v_cache_2=v.stride(2), stride_v_cache_3=v.stride(3), query_start_len_ptr=cu_seqlens_q, - BLOCK_Q=BLOCK_Q, num_seqs=num_seqs, - BLOCK_M=BLOCK_M, + # tunable parameters + # BLOCK_M=BLOCK_M, + # BLOCK_Q=BLOCK_Q, + # TILE_SIZE=TILE_SIZE_DECODE, ) elif force_selection == 3: # for initial version, NUM_SEGMENTS = 16 is chosen as a default diff --git a/scripts/callers/flash_attn.py b/scripts/callers/flash_attn.py index 6a63d778d..8db659f52 100644 --- a/scripts/callers/flash_attn.py +++ b/scripts/callers/flash_attn.py @@ -204,7 +204,7 @@ def call_and_process_output(): block_table=block_tables, # window_size=(-1, 1), # softcap=0, - # fa_version=2, # TODO + fa_version=3, # TODO ) return call_and_process_output diff --git a/scripts/setups/prefix_grid.conf b/scripts/setups/prefix_grid.conf index a299a91ed..bdde70751 100644 --- a/scripts/setups/prefix_grid.conf +++ b/scripts/setups/prefix_grid.conf @@ -27,7 +27,8 @@ BENCHMARK_MODES = ["CUDA_EVENTS"] # IMPLEMENTATION_UT = ["UNF_TRITON_2D_TUNED", "UNF_TRITON_2D_SIMPLE"] # IMPLEMENTATION_UT = ["GRID_TRITON_2D", "GRID_TRITON_3D"] -IMPLEMENTATION_UT = ["TRITON_2D", "FLASH_ATTN"] +# IMPLEMENTATION_UT = ["FLASH_ATTN"] +IMPLEMENTATION_UT = ["UNF_TRITON_2D", 
"UNF_TRITON_3D"] # TRITON_BACKEND_DEBUG = 1 # STORE_TEST_RESULT_PATH=/results From 5be31b1d775b717b70106ffce529a4487d159248 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Mon, 1 Sep 2025 10:11:18 -0400 Subject: [PATCH 58/61] updating fmwork pointer Signed-off-by: Burkhard Ringlein --- .gitmodules | 2 +- third_party/fmwork | 2 +- vllm | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitmodules b/.gitmodules index a3b1beaed..2c9aa057e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -9,4 +9,4 @@ url = https://github.com/vllm-project/vllm.git [submodule "third_party/fmwork"] path = third_party/fmwork - url = https://github.com/IBM/fmwork.git + url = git@github.com:bringlein/fmwork.git diff --git a/third_party/fmwork b/third_party/fmwork index 846345f3c..2083a4e33 160000 --- a/third_party/fmwork +++ b/third_party/fmwork @@ -1 +1 @@ -Subproject commit 846345f3c5f8f0f42a7dbfbc297ed5cd66f09ece +Subproject commit 2083a4e3376ba8b6318aba7b8f10b6bfb830b912 diff --git a/vllm b/vllm index 8b1c0ff03..8ba5e3324 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit 8b1c0ff03130ba7b22f15e61b0bf68da0f30a3ca +Subproject commit 8ba5e3324c93ea4c2b791676baa93838dbe0ca9e From 0eea19558db3729ca3a2e354a79006a8f4677309 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Wed, 3 Sep 2025 07:56:23 -0400 Subject: [PATCH 59/61] some cleanup Signed-off-by: Burkhard Ringlein --- ...256,device_name=NVIDIA_H100_80GB_HBM3.json | 146 - ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 - ...512,device_name=NVIDIA_H100_80GB_HBM3.json | 146 - ...384,device_name=NVIDIA_H100_80GB_HBM3.json | 146 - ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 - ...768,device_name=NVIDIA_H100_80GB_HBM3.json | 146 - .../default/cache.json | 110 - .../default/cache.json | 110 - .../default/cache.json | 98 - .../default/cache.json | 26 - .../default/cache.json | 25 - .../default/cache.json | 31 - .../default/cache.json | 26 - .../default/cache.json | 30 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 27 - .../default/cache.json | 28 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 347 - .../default/cache.json | 387 - .../default/cache.json | 347 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 347 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 27 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 36 - .../default/cache.json | 347 - .../default/cache.json | 387 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 347 - .../default/cache.json | 8 - .../default/cache.json | 32 - .../default/cache.json | 8 - .../default/cache.json | 27 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 36 - .../default/cache.json | 36 - .../default/cache.json | 36 - .../default/cache.json | 387 - .../default/cache.json | 36 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 36 - .../default/cache.json | 35 - .../default/cache.json | 35 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 32 - .../default/cache.json | 32 - .../default/cache.json | 32 - .../default/cache.json | 32 - .../default/cache.json | 8 - 
.../default/cache.json | 8 - .../default/cache.json | 32 - .../default/cache.json | 8 - .../default/cache.json | 27 - .../default/cache.json | 27 - .../default/cache.json | 27 - .../default/cache.json | 8 - .../default/cache.json | 27 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 8 - triton-dejavu | 2 +- tune_log_g4small.txt | 14544 -- tuning_0.log | 113327 --------------- vllm | 2 +- 82 files changed, 2 insertions(+), 133068 deletions(-) delete mode 100644 E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json delete mode 100644 E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json delete mode 100644 E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json delete mode 100644 E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json delete mode 100644 E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json delete mode 100644 E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/cuda_12.4/gpu_NVIDIA_A100-SXM4-80GB/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/rocm_6.3.1/gpu_AMD_Instinct_MI250X_MI250/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_cumsum_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default/cache.json delete mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-e1d63b4ce9f3ae5e2f38b68d3d8257474338c0a672ac24128b374d342459d7e1/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-68916ac9231d70c9dfa4b1081268470f5b25a8dbabb73d3818ba7e74c7fdc03c/default/cache.json delete mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json delete mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-c01d6c3dfb6d587c5fb5a1edbe6d606a9804204c3305d997bb82640bf3e80282/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_6.3.1/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-90178d0ab8e71db9cd16710d562763dd010643f28cd21980d5064c3ab782ecaa/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json delete mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-48e3cb6cd6592d4b55826bce9ff39781f5f8d3beec28e171da3dd4e5109ad732/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json delete mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-96fc3b4e585fc8cfcb4fcdd974640839b5a5889cf4f54dbf57ad6a3439b671d0/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f52792779faa0af779cada63f2df14c185a5b34f253646e36c07bb8926f93dc8/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-88d41f86261407aa0eaf355d2d650ddaee68bdf62e28c6cc74f4e1bcacddcfd8/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3a6fc1c46225b2f7d0bc848adf5344e3dda28dcbb0957584ee22138ce6625218/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3ec72a24614e22e4f8984d4b3b95b35928fcaf36a5101e03f51287f47aa54959/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-52c92ceef6d420c78c5c5940c8b38fe551467bdabe0ca1810415fbe039359610/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json delete mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-5c087adac96d09b2060f573486a99205cda08f58e544b9acfd14918832e2e582/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-72dc6d55a572ac899f3da4b41257cc6aeb8cad69a0fc94b16aa73ca9c82b4012/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-f130aa2e7a5258b0e95f6494e2db37f5dea3ccbb97ee8feed09d2d36599bff88/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-071e784de56797ed9764ebe722a0ebf6c8c9719610c15e34a8b3a8f9fe7252ae/default/cache.json delete mode 100644 tune_log_g4small.txt delete mode 100644 tuning_0.log diff --git a/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json b/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json deleted file mode 100644 index 147a83660..000000000 --- a/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +++ /dev/null @@ -1,146 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "2": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "4": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 4 - }, - "8": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 3 - }, - "16": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2 - }, - "24": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 2 - }, - 
"32": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2 - }, - "48": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 2 - }, - "64": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2 - }, - "96": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2 - }, - "128": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 2 - }, - "256": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 8, - "num_stages": 3 - }, - "512": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2 - }, - "1024": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2 - }, - "1536": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "2048": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "3072": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "4096": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - } -} diff --git a/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json deleted file mode 100644 index ac556d936..000000000 --- a/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +++ /dev/null @@ -1,146 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "2": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "4": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 5 - }, - "8": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2 - }, - "16": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2 - }, - "24": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 3 - }, - "32": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 4 - }, - "48": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 3 - }, - "64": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 3 - }, - "96": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 3 - }, - "128": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 3 - }, - "256": { - "BLOCK_SIZE_M": 64, - 
"BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "512": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 3 - }, - "1024": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "1536": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 3 - }, - "2048": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 8, - "num_stages": 4 - }, - "3072": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 4 - }, - "4096": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 8, - "num_stages": 4 - } -} diff --git a/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json b/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json deleted file mode 100644 index a01e9c317..000000000 --- a/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +++ /dev/null @@ -1,146 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "2": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "4": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 2 - }, - "8": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 2 - }, - "16": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 5 - }, - "24": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 3 - }, - "32": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "48": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 2 - }, - "64": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2 - }, - "96": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2 - }, - "128": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 2 - }, - "256": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2 - }, - "512": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 8, - "num_stages": 2 - }, - "1024": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2 - }, - "1536": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "2048": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "3072": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - 
"num_warps": 8, - "num_stages": 3 - }, - "4096": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - } -} diff --git a/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json b/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json deleted file mode 100644 index a7cfd175d..000000000 --- a/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +++ /dev/null @@ -1,146 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "2": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "4": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 2 - }, - "8": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 3 - }, - "16": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 4 - }, - "24": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "32": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 4 - }, - "48": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 4 - }, - "64": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 2 - }, - "96": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "128": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "256": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2 - }, - "512": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "1024": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "1536": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "2048": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "3072": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "4096": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - } -} diff --git a/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json deleted file mode 100644 index 79fe4dbe7..000000000 --- a/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +++ /dev/null @@ -1,146 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 5 - }, - "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 3 - }, - "4": { - 
"BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 3 - }, - "8": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2 - }, - "16": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 4 - }, - "24": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 3 - }, - "32": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 3 - }, - "48": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 3 - }, - "64": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 3 - }, - "96": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "128": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 3 - }, - "256": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 3 - }, - "512": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 3 - }, - "1024": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 8, - "num_stages": 4 - }, - "1536": { - "BLOCK_SIZE_M": 256, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 8, - "num_stages": 4 - }, - "2048": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 4 - }, - "3072": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 4 - }, - "4096": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 3 - } -} diff --git a/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json b/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json deleted file mode 100644 index 3caae02cb..000000000 --- a/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +++ /dev/null @@ -1,146 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "2": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 3 - }, - "4": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 5 - }, - "8": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 4 - }, - "16": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 8, - "num_stages": 3 - }, - "24": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 3 - }, - "32": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 8, - "num_stages": 3 - }, - "48": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - 
"GROUP_SIZE_M": 32, - "num_warps": 8, - "num_stages": 3 - }, - "64": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2 - }, - "96": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 4 - }, - "128": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 4 - }, - "256": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 4 - }, - "512": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "1024": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "1536": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "2048": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "3072": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 8, - "num_stages": 3 - }, - "4096": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - } -} diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/cuda_12.4/gpu_NVIDIA_A100-SXM4-80GB/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/cuda_12.4/gpu_NVIDIA_A100-SXM4-80GB/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json deleted file mode 100755 index 19e6fc76c..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/cuda_12.4/gpu_NVIDIA_A100-SXM4-80GB/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json +++ /dev/null @@ -1,110 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", - "total_bench_time_s": 211706.17069911957, - "evaluated_configs": 450, - "keys": [ - "HQ", - "HK", - "IS_CAUSAL", - "dropout_p", - "BLOCK_DMODEL", - "stride_qz", - "stride_qh", - "stride_qm", - "stride_qk", - "stride_kz", - "stride_kh", - "stride_kn", - "stride_kk", - "stride_vz", - "stride_vh", - "stride_vn", - "stride_vk", - "stride_oz", - "stride_oh", - "stride_om", - "stride_on", - "stride_bz", - "stride_bh", - "stride_bm", - "stride_bn", - "stride_az", - "stride_ah", - "MAX_SEQLENS_Q", - "MAX_SEQLENS_K", - 
"VARLEN", - "ACTUAL_BLOCK_DMODEL" - ], - "cache": { - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '32', '32', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '32', '32', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '64', '64', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '64', '64', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 32, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '256', '256', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 
'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 32, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '256', '256', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 64, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 64, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 64, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, 
maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '4096', '4096', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '32', '32', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.005401020869612694 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '32', '32', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.005471085663884878 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '64', '64', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.0075958045199513435 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '64', '64', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.007605006452649832 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.011812349781394005 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.011950820684432983 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '256', '256', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 
'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.019297460094094276 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '256', '256', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.017475301399827003 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.038042228668928146 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.038091544061899185 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.10096532106399536 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.09481953084468842 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.2949035167694092 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.29237720370292664 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '4096', '4096', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.9560787677764893 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json deleted file mode 100755 index a7b0d4282..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json +++ /dev/null @@ -1,110 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", - "total_bench_time_s": 86841.6919836998, - "evaluated_configs": 240, - "keys": [ - "HQ", - "HK", - "IS_CAUSAL", - "dropout_p", - "BLOCK_DMODEL", - "stride_qz", - "stride_qh", - "stride_qm", - "stride_qk", - "stride_kz", - "stride_kh", - "stride_kn", - "stride_kk", - "stride_vz", - "stride_vh", - "stride_vn", - "stride_vk", - "stride_oz", - "stride_oh", - "stride_om", - "stride_on", - "stride_bz", - "stride_bh", - "stride_bm", - "stride_bn", - "stride_az", - "stride_ah", - "MAX_SEQLENS_Q", - "MAX_SEQLENS_K", - "VARLEN", - "ACTUAL_BLOCK_DMODEL" - ], - "cache": { - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '32', '32', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '32', '32', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '64', '64', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, 
PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '64', '64', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '256', '256', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '256', '256', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', 
'1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '4096', '4096', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '32', '32', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 
0.0036645731888711452 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '32', '32', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.0036076440010219812 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '64', '64', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.00487453443929553 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '64', '64', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.0048555657267570496 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.006982282269746065 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.006992792245000601 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '256', '256', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.010331092402338982 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '256', '256', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.010227189399302006 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.015056964010000229 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.014920394867658615 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 
'torch.int32', 'torch.int32')": [ - 0.04663630574941635 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.04339428246021271 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.1311214417219162 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.12436506152153015 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '4096', '4096', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.39030927419662476 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/rocm_6.3.1/gpu_AMD_Instinct_MI250X_MI250/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/rocm_6.3.1/gpu_AMD_Instinct_MI250X_MI250/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json deleted file mode 100755 index a7669881a..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/rocm_6.3.1/gpu_AMD_Instinct_MI250X_MI250/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", - "total_bench_time_s": 86906.62447404861, - "evaluated_configs": 450, - "keys": [ - "HQ", - "HK", - "IS_CAUSAL", - "dropout_p", - "BLOCK_DMODEL", - "stride_qz", - "stride_qh", - "stride_qm", - "stride_qk", - "stride_kz", - "stride_kh", - "stride_kn", - "stride_kk", - "stride_vz", - "stride_vh", - "stride_vn", - "stride_vk", - 
"stride_oz", - "stride_oh", - "stride_om", - "stride_on", - "stride_bz", - "stride_bh", - "stride_bm", - "stride_bn", - "stride_az", - "stride_ah", - "MAX_SEQLENS_Q", - "MAX_SEQLENS_K", - "VARLEN", - "ACTUAL_BLOCK_DMODEL" - ], - "cache": { - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '16', '16', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 16, BLOCK_N: 16, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '16', '16', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 16, BLOCK_N: 16, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 16, BLOCK_N: 16, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 2, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 16, BLOCK_N: 16, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 2, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 256, BLOCK_N: 64, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 256, BLOCK_N: 64, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', 
'1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 64, BLOCK_N: 64, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 64, BLOCK_N: 64, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 64, BLOCK_N: 64, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 64, BLOCK_N: 64, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '4096', '4096', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 256, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '4096', '4096', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 256, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '16', '16', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 
'torch.int32')": [ - 0.004207286983728409 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '16', '16', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.004182395525276661 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.01809287816286087 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.017839614301919937 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.09088581800460815 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.088987797498703 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.23396557569503784 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.23347480595111847 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.6691922545433044 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.6695101261138916 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '4096', '4096', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 
'torch.int32', 'torch.int32', 'torch.int32')": [ - 2.025791645050049 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '4096', '4096', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 2.01798415184021 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json deleted file mode 100755 index 0225f79be..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_bmm:_bmm_chunk_fwd_kernel)", - "total_bench_time_s": 10.309182405471802, - "evaluated_configs": 9, - "keys": [ - "chunk_size", - "K", - "IS_CAUSAL" - ], - "cache": { - "('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 128, BLOCK_SIZE_K: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": [ - 0.04188799858093262 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": false - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_cumsum_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_cumsum_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default/cache.json deleted file mode 100755 index 5b20369a8..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_cumsum_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default/cache.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_cumsum_fwd_kernel)", - "total_bench_time_s": 8.378965139389038, - "evaluated_configs": 7, - "keys": [ - "chunk_size", - "nheads" - ], - "cache": { - "('256', '64', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": "BLOCK_SIZE_H: 32, num_warps: 4, num_ctas: 1, num_stages: 3, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('256', '64', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": [ - 0.05206400156021118 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": false - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-e1d63b4ce9f3ae5e2f38b68d3d8257474338c0a672ac24128b374d342459d7e1/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-e1d63b4ce9f3ae5e2f38b68d3d8257474338c0a672ac24128b374d342459d7e1/default/cache.json deleted file mode 100755 index 14c211cf5..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-e1d63b4ce9f3ae5e2f38b68d3d8257474338c0a672ac24128b374d342459d7e1/default/cache.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_scan:_chunk_scan_fwd_kernel)", - "total_bench_time_s": 36.24500060081482, - "evaluated_configs": 11, - "keys": [ 
- "chunk_size", - "hdim", - "dstate", - "IS_CAUSAL" - ], - "cache": { - "('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 256, BLOCK_SIZE_K: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": [ - 0.20547200739383698 - ], - "('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32')": [ - 0.6873279809951782 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": false - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json deleted file mode 100755 index 2aeb42c51..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_fwd_kernel)", - "total_bench_time_s": 10.325033903121948, - "evaluated_configs": 9, - "keys": [ - "hdim", - "dstate", - "chunk_size" - ], - "cache": { - "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 128, BLOCK_SIZE_K: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, 
num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": [ - 0.08188799768686295 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": false - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json deleted file mode 100755 index 3b86e0dae..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_varlen_kernel)", - "total_bench_time_s": 23.77578854560852, - "evaluated_configs": 9, - "keys": [ - "hdim", - "dstate", - "chunk_size" - ], - "cache": { - "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 128, BLOCK_SIZE_K: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 32, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 5, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16')": [ - 0.09270399808883667 - ], - "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16')": [ - 0.01027199998497963 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": false - } -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json deleted file mode 100755 index c2b3452bf..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.mamba_ssm:_selective_scan_update_kernel)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json deleted file mode 100755 index c2b3452bf..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.mamba_ssm:_selective_scan_update_kernel)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json deleted file mode 100755 index 60a6d6935..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.mamba_ssm:_selective_scan_update_kernel)", - "total_bench_time_s": 58.42541313171387, - "evaluated_configs": 75, - "keys": [ - "dstate", - "BLOCK_SIZE_DSTATE", - "dim", - "nheads_ngroups_ratio" - ], - "cache": { - "('128', '128', '64', '128', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32')": "BLOCK_SIZE_M: 8, num_warps: 2, num_ctas: 1, num_stages: 6, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('128', '128', '64', '128', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32')": [ - 0.003274054965004325 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-68916ac9231d70c9dfa4b1081268470f5b25a8dbabb73d3818ba7e74c7fdc03c/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-68916ac9231d70c9dfa4b1081268470f5b25a8dbabb73d3818ba7e74c7fdc03c/default/cache.json deleted file mode 
100755 index 04198714b..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-68916ac9231d70c9dfa4b1081268470f5b25a8dbabb73d3818ba7e74c7fdc03c/default/cache.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_state_passing:_state_passing_fwd_kernel)", - "total_bench_time_s": 9.725267887115479, - "evaluated_configs": 6, - "keys": [ - "dim" - ], - "cache": { - "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32')": "BLOCK_SIZE: 512, num_warps: 4, num_ctas: 1, num_stages: 3, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32')": "BLOCK_SIZE: 512, num_warps: 4, num_ctas: 1, num_stages: 3, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32')": [ - 0.059007998555898666 - ], - "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32')": [ - 0.08220800012350082 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": false - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json deleted file mode 100755 index 2540ac5c3..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json deleted file mode 100755 index 5b55f921d..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index 04eb1f234..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,347 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 34544.99443292618, - "evaluated_configs": 540, - "keys": [ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - 
"num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3" - ], - "cache": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '64', '32', '4', '16', '128', '128', 
'0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', 
'512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: 
None", - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, 
maxnreg: None", - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, 
reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003466148627921939 - ], - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003575095208361745 - ], - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004993442911654711 - ], - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006109926383942366 - ], - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03988393768668175 - ], - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.09943539649248123 - ], - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3283151388168335 - ], - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.0377004146575928 - ], - "('1', '16', '1', '16', '32', '4', '16', 
'128', '128', '0', '1', '1')": [ - 0.0033776038326323032 - ], - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003488453570753336 - ], - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033901487477123737 - ], - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0032401704229414463 - ], - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004394480027258396 - ], - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004883989226073027 - ], - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0045789312571287155 - ], - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006259772460907698 - ], - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.010929320007562637 - ], - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.040549296885728836 - ], - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.02016238309442997 - ], - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.1051921397447586 - ], - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03749670833349228 - ], - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3411431908607483 - ], - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0701025053858757 - ], - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.0497854948043823 - ], - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0034944734070450068 - ], - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0042336732149124146 - ], - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005933090578764677 - ], - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.026846082881093025 - ], - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07565699517726898 - ], - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2685732841491699 - ], - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.8566849827766418 - ], - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003527216147631407 - ], - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004583046771585941 - ], - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0060236589051783085 - ], - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.026979871094226837 - ], - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08126690983772278 - ], - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2932415306568146 - ], - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.8659728765487671 - ], - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00306075531989336 - ], - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0034781373105943203 - ], - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003616524860262871 - ], - "('4', '4', '4', '4', '32', '4', 
'16', '128', '128', '0', '1', '1')": [ - 0.0030675148591399193 - ], - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0038118616212159395 - ], - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003134604310616851 - ], - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0055700079537928104 - ], - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.009849821217358112 - ], - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014783395454287529 - ], - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04928915575146675 - ], - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.15255023539066315 - ], - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.013137963600456715 - ], - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.4398653507232666 - ], - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.4163719415664673 - ], - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033607585355639458 - ], - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0038107747677713633 - ], - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004322108346968889 - ], - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033715730533003807 - ], - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004160675685852766 - ], - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004942106083035469 - ], - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00334966741502285 - ], - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0050212424248456955 - ], - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007804282940924168 - ], - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007798833306878805 - ], - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014028973877429962 - ], - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03204701468348503 - ], - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08394649624824524 - ], - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08103202283382416 - ], - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.23096241056919098 - ], - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006906270515173674 - ], - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.23079754412174225 - ], - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.7025490999221802 - ], - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6989444494247437 - ], - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 2.3537752628326416 - ], - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004250869620591402 - ], - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005911743268370628 - ], - "('64', '128', '64', '64', '32', '4', 
'16', '128', '128', '0', '1', '1')": [ - 0.011380953714251518 - ], - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.05582933872938156 - ], - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.16943588852882385 - ], - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.4909878969192505 - ], - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.5911381244659424 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index 1a8388dae..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,387 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 67657.00523352623, - "evaluated_configs": 540, - "keys": [ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3" - ], - "cache": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, 
num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 64, num_warps: 8, num_ctas: 1, num_stages: 8, 
num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": 
"BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 64, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', 
'1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '32', '32', '4', '16', '128', 
'128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: 
None", - "('1', '16', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, 
maxnreg: None", - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0034347970504313707 - ], - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0035579479299485683 - ], - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00523252971470356 - ], - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006011391524225473 - ], - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.023085465654730797 - ], - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08206301927566528 - ], - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3279804289340973 - ], - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.1915172338485718 - ], - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033755453769117594 - ], - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003468221053481102 - ], - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00334682478569448 - ], - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0035435776226222515 - ], - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004342962987720966 - ], - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00496680336073041 - ], - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004553888458758593 - ], - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007391158025711775 - ], - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.011154169216752052 - ], - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04036085680127144 - ], - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.019932862371206284 - ], - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08319558948278427 - ], - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03744187951087952 - ], - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3325899839401245 - ], - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.06968305259943008 - ], - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.184262990951538 - ], - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003470577532425523 - ], - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004544882569462061 - ], - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00577146140858531 - ], - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ 
- 0.022477485239505768 - ], - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04180074483156204 - ], - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.16259081661701202 - ], - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6357383131980896 - ], - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0034817454870790243 - ], - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00421161251142621 - ], - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00583713548257947 - ], - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.02271271124482155 - ], - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07548002898693085 - ], - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.17187528312206268 - ], - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6434140801429749 - ], - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033293836750090122 - ], - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003431792138144374 - ], - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003589486936107278 - ], - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003379078349098563 - ], - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0041108024306595325 - ], - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033878879621624947 - ], - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006029331590980291 - ], - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008353302255272865 - ], - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.013032807968556881 - ], - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04468222334980965 - ], - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.1537272334098816 - ], - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01300885435193777 - ], - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.48241302371025085 - ], - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.7054001092910767 - ], - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033725856337696314 - ], - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0037622733507305384 - ], - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004256599582731724 - ], - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00334113254211843 - ], - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004093301948159933 - ], - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004860257264226675 - ], - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003374352352693677 - ], - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005010899156332016 - ], - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007828187197446823 
- ], - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007898394018411636 - ], - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014706183224916458 - ], - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03305657580494881 - ], - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08440500497817993 - ], - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08125007152557373 - ], - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2514193058013916 - ], - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006724500097334385 - ], - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.22513994574546814 - ], - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.8429425954818726 - ], - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6514143943786621 - ], - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 3.03377103805542 - ], - "('1', '8', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033735581673681736 - ], - "('1', '16', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003457766491919756 - ], - "('1', '32', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003451892174780369 - ], - "('1', '64', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004423843696713448 - ], - "('1', '128', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004672772716730833 - ], - "('1', '256', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006518691312521696 - ], - "('1', '512', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.010816759429872036 - ], - "('1', '1024', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01876869797706604 - ], - "('1', '2048', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03477397561073303 - ], - "('1', '4096', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07260602712631226 - ], - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004245477728545666 - ], - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006100499536842108 - ], - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008639966137707233 - ], - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04726530611515045 - ], - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.14509893953800201 - ], - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.4709869623184204 - ], - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.6025410890579224 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index 04eb1f234..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,347 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 34544.99443292618, - "evaluated_configs": 540, - "keys": [ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3" - ], - "cache": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', 
'0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '2048', '32', '4', '16', 
'128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - 
"('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: 
None", - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: 
None", - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, 
reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003466148627921939 - ], - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003575095208361745 - ], - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004993442911654711 - ], - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006109926383942366 - ], - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03988393768668175 - ], - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.09943539649248123 - ], - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3283151388168335 - ], - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.0377004146575928 - ], - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033776038326323032 - ], - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003488453570753336 - ], - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033901487477123737 - ], - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0032401704229414463 - ], - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004394480027258396 - ], - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004883989226073027 - ], - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0045789312571287155 - ], - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006259772460907698 - ], - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.010929320007562637 - ], - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.040549296885728836 - ], - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.02016238309442997 - ], - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.1051921397447586 - ], - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03749670833349228 - ], - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3411431908607483 - ], - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0701025053858757 - ], - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.0497854948043823 - ], - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0034944734070450068 - 
], - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0042336732149124146 - ], - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005933090578764677 - ], - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.026846082881093025 - ], - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07565699517726898 - ], - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2685732841491699 - ], - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.8566849827766418 - ], - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003527216147631407 - ], - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004583046771585941 - ], - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0060236589051783085 - ], - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.026979871094226837 - ], - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08126690983772278 - ], - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2932415306568146 - ], - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.8659728765487671 - ], - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00306075531989336 - ], - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0034781373105943203 - ], - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003616524860262871 - ], - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0030675148591399193 - ], - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0038118616212159395 - ], - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003134604310616851 - ], - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0055700079537928104 - ], - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.009849821217358112 - ], - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014783395454287529 - ], - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04928915575146675 - ], - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.15255023539066315 - ], - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.013137963600456715 - ], - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.4398653507232666 - ], - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.4163719415664673 - ], - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033607585355639458 - ], - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0038107747677713633 - ], - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004322108346968889 - ], - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033715730533003807 - ], - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004160675685852766 - ], - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004942106083035469 - ], - 
"('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00334966741502285 - ], - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0050212424248456955 - ], - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007804282940924168 - ], - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007798833306878805 - ], - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014028973877429962 - ], - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03204701468348503 - ], - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08394649624824524 - ], - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08103202283382416 - ], - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.23096241056919098 - ], - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006906270515173674 - ], - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.23079754412174225 - ], - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.7025490999221802 - ], - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6989444494247437 - ], - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 2.3537752628326416 - ], - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004250869620591402 - ], - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005911743268370628 - ], - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.011380953714251518 - ], - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.05582933872938156 - ], - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.16943588852882385 - ], - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.4909878969192505 - ], - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.5911381244659424 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index a7c2af725..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json deleted file mode 100755 index a4569e066..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention_tuned:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-c01d6c3dfb6d587c5fb5a1edbe6d606a9804204c3305d997bb82640bf3e80282/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-c01d6c3dfb6d587c5fb5a1edbe6d606a9804204c3305d997bb82640bf3e80282/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index a7c2af725..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-c01d6c3dfb6d587c5fb5a1edbe6d606a9804204c3305d997bb82640bf3e80282/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index 04eb1f234..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,347 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 34544.99443292618, - "evaluated_configs": 540, - "keys": [ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3" - ], - "cache": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": 
"BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '512', '32', '4', '16', '128', 
'128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - 
"('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - 
"('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: 
None", - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, 
reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003466148627921939 - ], - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003575095208361745 - ], - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004993442911654711 - ], - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006109926383942366 - ], - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03988393768668175 - ], - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.09943539649248123 - ], - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3283151388168335 - ], - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.0377004146575928 - ], - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033776038326323032 - ], - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003488453570753336 - ], - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033901487477123737 - ], - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0032401704229414463 - ], - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004394480027258396 - ], - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004883989226073027 - ], - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0045789312571287155 - ], - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006259772460907698 - ], - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.010929320007562637 - ], - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.040549296885728836 
- ], - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.02016238309442997 - ], - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.1051921397447586 - ], - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03749670833349228 - ], - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3411431908607483 - ], - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0701025053858757 - ], - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.0497854948043823 - ], - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0034944734070450068 - ], - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0042336732149124146 - ], - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005933090578764677 - ], - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.026846082881093025 - ], - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07565699517726898 - ], - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2685732841491699 - ], - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.8566849827766418 - ], - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003527216147631407 - ], - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004583046771585941 - ], - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0060236589051783085 - ], - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.026979871094226837 - ], - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08126690983772278 - ], - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2932415306568146 - ], - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.8659728765487671 - ], - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00306075531989336 - ], - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0034781373105943203 - ], - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003616524860262871 - ], - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0030675148591399193 - ], - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0038118616212159395 - ], - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003134604310616851 - ], - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0055700079537928104 - ], - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.009849821217358112 - ], - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014783395454287529 - ], - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04928915575146675 - ], - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.15255023539066315 - ], - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.013137963600456715 - ], - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 
0.4398653507232666 - ], - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.4163719415664673 - ], - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033607585355639458 - ], - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0038107747677713633 - ], - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004322108346968889 - ], - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033715730533003807 - ], - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004160675685852766 - ], - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004942106083035469 - ], - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00334966741502285 - ], - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0050212424248456955 - ], - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007804282940924168 - ], - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007798833306878805 - ], - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014028973877429962 - ], - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03204701468348503 - ], - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08394649624824524 - ], - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08103202283382416 - ], - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.23096241056919098 - ], - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006906270515173674 - ], - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.23079754412174225 - ], - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.7025490999221802 - ], - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6989444494247437 - ], - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 2.3537752628326416 - ], - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004250869620591402 - ], - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005911743268370628 - ], - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.011380953714251518 - ], - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.05582933872938156 - ], - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.16943588852882385 - ], - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.4909878969192505 - ], - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.5911381244659424 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json deleted file mode 100755 index d3eb13852..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json deleted file mode 100755 index e7d868df2..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_6.3.1/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_6.3.1/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json deleted file mode 100755 index 2540ac5c3..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_6.3.1/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json deleted file mode 100755 index c2b3452bf..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.mamba_ssm:_selective_scan_update_kernel)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-90178d0ab8e71db9cd16710d562763dd010643f28cd21980d5064c3ab782ecaa/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-90178d0ab8e71db9cd16710d562763dd010643f28cd21980d5064c3ab782ecaa/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json deleted file mode 100755 index d6bd3e752..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-90178d0ab8e71db9cd16710d562763dd010643f28cd21980d5064c3ab782ecaa/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.mamba_ssm:_selective_scan_update_kernel)", - "total_bench_time_s": 113.2074065208435, - "evaluated_configs": 75, - "keys": [ - "dstate", - "BLOCK_SIZE_DSTATE", - "dim", - "nheads_ngroups_ratio" - ], - "cache": { - "('128', '128', '64', '128', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32')": "BLOCK_SIZE_M: 16, num_warps: 4, num_ctas: 1, num_stages: 6, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('128', '128', '64', '128', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32')": [ - 0.0050251600332558155 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json deleted file mode 100755 index c2b3452bf..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.mamba_ssm:_selective_scan_update_kernel)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json deleted file mode 100755 index 2540ac5c3..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json deleted file mode 100755 index 2540ac5c3..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json deleted file mode 100755 index de8c75698..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 3830.64182972908, - "evaluated_configs": 540, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "is_prefill" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": "BLOCK_M: 32, TILE_SIZE: 16, num_warps: 8, num_ctas: 1, num_stages: 6, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": "BLOCK_M: 32, TILE_SIZE: 
16, num_warps: 8, num_ctas: 1, num_stages: 6, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": [ - 0.00517149455845356 - ], - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": [ - 0.00435659708455205 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index db665c68f..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,347 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 72002.96068787575, - "evaluated_configs": 540, - "keys": [ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3" - ], - "cache": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', 
'512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - 
"('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, 
reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, 
reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 
0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, 
num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006003436166793108 - ], - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006077692378312349 - ], - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0066948747262358665 - ], - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008714776486158371 - ], - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03953208029270172 - ], - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08529671281576157 - ], - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.26893165707588196 - ], - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.7998318672180176 - ], - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00574119808152318 - ], - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006026116665452719 - ], - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005752653814852238 - ], - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00608863914385438 - ], - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006379257421940565 - ], - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006695704068988562 - ], - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007991316728293896 - ], - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00874169822782278 - ], - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.021478423848748207 - ], - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.038848876953125 - ], - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03919544070959091 - ], - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 
0.08279953896999359 - ], - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07393984496593475 - ], - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.26520422101020813 - ], - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.143253892660141 - ], - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.8069456219673157 - ], - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006098074372857809 - ], - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006664188578724861 - ], - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008316880092024803 - ], - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.032703448086977005 - ], - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07349277287721634 - ], - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.17093537747859955 - ], - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6028901934623718 - ], - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006040927022695541 - ], - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006674066185951233 - ], - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008359000086784363 - ], - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.033145882189273834 - ], - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0726323127746582 - ], - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.16725540161132812 - ], - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6085386872291565 - ], - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00583583302795887 - ], - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00593462772667408 - ], - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006117511540651321 - ], - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0059266164898872375 - ], - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006205248646438122 - ], - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005945528391748667 - ], - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0069659799337387085 - ], - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.010612651705741882 - ], - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01373966969549656 - ], - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04602960869669914 - ], - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.12627318501472473 - ], - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014789633452892303 - ], - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3502292037010193 - ], - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.0954514741897583 - ], - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', 
'1')": [ - 0.005718982312828302 - ], - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006129336543381214 - ], - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006283498369157314 - ], - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0057284715585410595 - ], - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0061799646355211735 - ], - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007406504824757576 - ], - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005748743191361427 - ], - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006614300422370434 - ], - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008334673009812832 - ], - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.010265326127409935 - ], - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.015284508466720581 - ], - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03939511626958847 - ], - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07506544888019562 - ], - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08072267472743988 - ], - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.1980127990245819 - ], - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.011478512547910213 - ], - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.21105918288230896 - ], - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.5597497224807739 - ], - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.5454477071762085 - ], - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.9615601301193237 - ], - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00629243953153491 - ], - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008062037639319897 - ], - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01422079000622034 - ], - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0551898293197155 - ], - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.14126861095428467 - ], - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3813389539718628 - ], - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.2401379346847534 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index 5e025265d..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,387 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 81407.73767566681, - "evaluated_configs": 540, - "keys": [ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3" - ], - "cache": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, 
maxnreg: None", - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, 
reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 256, BLOCK_M: 32, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 256, BLOCK_M: 32, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, 
num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, 
num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, 
num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '16', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, 
num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0060075013898313046 - ], - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006072512362152338 - ], - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00672190822660923 - ], - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008806715719401836 - ], - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04485657438635826 - ], - "('1024', '1024', '1024', '1024', '32', '4', 
'16', '128', '128', '0', '1', '1')": [ - 0.09946674853563309 - ], - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.35092800855636597 - ], - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.324418544769287 - ], - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0057691833935678005 - ], - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006055567879229784 - ], - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005804183427244425 - ], - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006106226239353418 - ], - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006440665107220411 - ], - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006741056218743324 - ], - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007889878936111927 - ], - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008913432247936726 - ], - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.021346861496567726 - ], - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04106005281209946 - ], - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03879227116703987 - ], - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0952981486916542 - ], - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0731193870306015 - ], - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3475594222545624 - ], - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.14168496429920197 - ], - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.324677586555481 - ], - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0060554975643754005 - ], - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006669852416962385 - ], - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008174276910722256 - ], - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03536117449402809 - ], - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07847916334867477 - ], - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.18417692184448242 - ], - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6875757575035095 - ], - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006102146580815315 - ], - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006687485612928867 - ], - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0084276357665658 - ], - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03678948059678078 - ], - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07642015814781189 - ], - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.18387676775455475 - ], - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6868319511413574 - ], - "('2', '2', '2', '2', 
'32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005820533260703087 - ], - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0059619504027068615 - ], - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006105729844421148 - ], - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005979663692414761 - ], - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0062386938370764256 - ], - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005969700403511524 - ], - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007005539257079363 - ], - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.011318272911012173 - ], - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01767335832118988 - ], - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.048929426819086075 - ], - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.1755041629076004 - ], - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01716405153274536 - ], - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.5103733539581299 - ], - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.8636406660079956 - ], - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0057952022179961205 - ], - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006148397456854582 - ], - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006287233904004097 - ], - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005749743431806564 - ], - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006230741273611784 - ], - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007458249572664499 - ], - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00579081941395998 - ], - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006615426391363144 - ], - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00870793592184782 - ], - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01026986539363861 - ], - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.015668710693717003 - ], - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.040304314345121384 - ], - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0959310457110405 - ], - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0849064514040947 - ], - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2615358829498291 - ], - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.011502742767333984 - ], - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.25011205673217773 - ], - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.8817259073257446 - ], - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.7242566347122192 - ], - "('4096', '4096', '1024', '2048', '32', '4', '16', 
'128', '128', '0', '1', '1')": [ - 3.2800190448760986 - ], - "('1', '8', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00581999821588397 - ], - "('1', '16', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0058884210884571075 - ], - "('1', '32', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0058985608629882336 - ], - "('1', '64', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0065222084522247314 - ], - "('1', '128', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008244817145168781 - ], - "('1', '256', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.011564841493964195 - ], - "('1', '512', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.021496908739209175 - ], - "('1', '1024', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.038903381675481796 - ], - "('1', '2048', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07334144413471222 - ], - "('1', '4096', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.1418607085943222 - ], - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006298307329416275 - ], - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008144522085785866 - ], - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014301695860922337 - ], - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.06052287295460701 - ], - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.1740308254957199 - ], - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.4944685995578766 - ], - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.7257815599441528 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-48e3cb6cd6592d4b55826bce9ff39781f5f8d3beec28e171da3dd4e5109ad732/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-48e3cb6cd6592d4b55826bce9ff39781f5f8d3beec28e171da3dd4e5109ad732/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index a7c2af725..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-48e3cb6cd6592d4b55826bce9ff39781f5f8d3beec28e171da3dd4e5109ad732/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json deleted file mode 100755 index a4569e066..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention_tuned:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index 5b55f921d..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index a7c2af725..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index db665c68f..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,347 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 72002.96068787575, - "evaluated_configs": 540, - "keys": [ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3" - ], - "cache": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', 
'2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: 
None", - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, 
reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, 
num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, 
num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 
1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006003436166793108 - ], - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006077692378312349 - ], - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0066948747262358665 - ], - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008714776486158371 - ], - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03953208029270172 - ], - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08529671281576157 - ], - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.26893165707588196 - ], - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.7998318672180176 - ], - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00574119808152318 - ], - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006026116665452719 - ], - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005752653814852238 - ], - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00608863914385438 - ], - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006379257421940565 - ], - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006695704068988562 - ], - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007991316728293896 - ], - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00874169822782278 - ], - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.021478423848748207 - ], - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.038848876953125 - ], - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03919544070959091 - ], - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08279953896999359 - ], - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07393984496593475 - ], - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.26520422101020813 - ], - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.143253892660141 - ], - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.8069456219673157 - ], - "('16', '32', '16', '32', '32', '4', 
'16', '128', '128', '0', '1', '1')": [ - 0.006098074372857809 - ], - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006664188578724861 - ], - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008316880092024803 - ], - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.032703448086977005 - ], - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07349277287721634 - ], - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.17093537747859955 - ], - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6028901934623718 - ], - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006040927022695541 - ], - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006674066185951233 - ], - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008359000086784363 - ], - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.033145882189273834 - ], - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0726323127746582 - ], - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.16725540161132812 - ], - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6085386872291565 - ], - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00583583302795887 - ], - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00593462772667408 - ], - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006117511540651321 - ], - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0059266164898872375 - ], - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006205248646438122 - ], - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005945528391748667 - ], - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0069659799337387085 - ], - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.010612651705741882 - ], - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01373966969549656 - ], - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04602960869669914 - ], - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.12627318501472473 - ], - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014789633452892303 - ], - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3502292037010193 - ], - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.0954514741897583 - ], - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005718982312828302 - ], - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006129336543381214 - ], - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006283498369157314 - ], - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0057284715585410595 - ], - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0061799646355211735 - ], - "('32', '32', '8', '16', '32', '4', '16', '128', 
'128', '0', '1', '1')": [ - 0.007406504824757576 - ], - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005748743191361427 - ], - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006614300422370434 - ], - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008334673009812832 - ], - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.010265326127409935 - ], - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.015284508466720581 - ], - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03939511626958847 - ], - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07506544888019562 - ], - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08072267472743988 - ], - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.1980127990245819 - ], - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.011478512547910213 - ], - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.21105918288230896 - ], - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.5597497224807739 - ], - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.5454477071762085 - ], - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.9615601301193237 - ], - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00629243953153491 - ], - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008062037639319897 - ], - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01422079000622034 - ], - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0551898293197155 - ], - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.14126861095428467 - ], - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3813389539718628 - ], - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.2401379346847534 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index a7c2af725..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json deleted file mode 100755 index 901033d5b..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 1805.8680896759033, - "evaluated_configs": 540, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": [ - 0.003383171046152711 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index d3eb13852..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json deleted file mode 100755 index 165560713..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 115.25903606414795, - "evaluated_configs": 90, - "keys": [ - 
"num_query_heads", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '128', '128', '16')": "TILE_SIZE: 128, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '128', '128', '16')": [ - 0.0028324048034846783 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json deleted file mode 100755 index e7d868df2..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json deleted file mode 100755 index c2b3452bf..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.mamba_ssm:_selective_scan_update_kernel)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json deleted file mode 100755 index 2540ac5c3..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json deleted file mode 100755 index 710e7b803..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 2846.828315258026, - "evaluated_configs": 540, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "is_prefill" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": [ - 0.003479903331026435 - ], - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": [ - 0.003208082402125001 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index 17a69de08..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 1721.1768200397491, - "evaluated_configs": 5400, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", 
- "stride_k_cache_3", - "stride_v_cache_3", - "is_prefill" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": "BLOCK_M: 16, TILE_SIZE: 32, num_warps: 8, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": "BLOCK_M: 16, TILE_SIZE: 32, num_warps: 8, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": [ - 0.004668071866035461 - ], - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": [ - 0.0035326406359672546 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json deleted file mode 100755 index 870c8b475..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 863.3593587875366, - "evaluated_configs": 2160, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "is_prefill" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": "BLOCK_M: 16, TILE_SIZE: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": "BLOCK_M: 16, TILE_SIZE: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": [ - 0.007799518760293722 - ], - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": [ - 0.006862994749099016 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - 
"rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json deleted file mode 100755 index 472c55180..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json +++ /dev/null @@ -1,387 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention_tuned:kernel_unified_attention_2d)", - "total_bench_time_s": 32995.41111779213, - "evaluated_configs": 2160, - "keys": [ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3" - ], - "cache": { - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, 
reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, 
num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, 
num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, 
num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 3, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, 
num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '16', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": 
"BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '32', '32', '32', '32', 
'4', '16', '128', '128', '0', '1', '1')": [ - 0.006897487211972475 - ], - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007865289226174355 - ], - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.012806367129087448 - ], - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.11409414559602737 - ], - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.36400967836380005 - ], - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.291664481163025 - ], - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 4.830662727355957 - ], - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0067154536955058575 - ], - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007009030785411596 - ], - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006567405071109533 - ], - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006921715103089809 - ], - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007554848212748766 - ], - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007870307192206383 - ], - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.012347826734185219 - ], - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.018965136259794235 - ], - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03259870782494545 - ], - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.11627256125211716 - ], - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0549253448843956 - ], - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.37127885222435 - ], - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.09950052946805954 - ], - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.3021571636199951 - ], - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.1874120533466339 - ], - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 4.851548671722412 - ], - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006778071168810129 - ], - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006958519574254751 - ], - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006996186450123787 - ], - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006850973702967167 - ], - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00791214406490326 - ], - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006878295913338661 - ], - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.013943970203399658 - ], - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.02429494820535183 - ], - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03789611533284187 - ], - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.15952551364898682 - ], - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', 
'0', '1', '1')": [ - 0.5120749473571777 - ], - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03336550295352936 - ], - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.803341269493103 - ], - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 6.802962303161621 - ], - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0067731114104390144 - ], - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007123402785509825 - ], - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.013310004025697708 - ], - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006687874905765057 - ], - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00769382668659091 - ], - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014694097451865673 - ], - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006742445286363363 - ], - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00831019226461649 - ], - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.02136719599366188 - ], - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0192007627338171 - ], - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04041781276464462 - ], - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.09291289746761322 - ], - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.28874820470809937 - ], - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2564668357372284 - ], - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.918175995349884 - ], - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.02123316191136837 - ], - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.7775593996047974 - ], - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 3.24080228805542 - ], - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 2.575653076171875 - ], - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 12.103424072265625 - ], - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0063226004131138325 - ], - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0069314902648329735 - ], - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007872514426708221 - ], - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01249010395258665 - ], - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08108722418546677 - ], - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2769642770290375 - ], - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.986293613910675 - ], - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 3.6365156173706055 - ], - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0069512976333498955 - ], - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', 
'1')": [ - 0.007947840727865696 - ], - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.012514323927462101 - ], - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08159603923559189 - ], - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2810220718383789 - ], - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.9966282248497009 - ], - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 3.6692380905151367 - ], - "('1', '8', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0068373410031199455 - ], - "('1', '16', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006867218296974897 - ], - "('1', '32', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0068841795437037945 - ], - "('1', '64', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007741984911262989 - ], - "('1', '128', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01235784962773323 - ], - "('1', '256', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.02117188833653927 - ], - "('1', '512', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03320121765136719 - ], - "('1', '1024', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.05449502542614937 - ], - "('1', '2048', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.09907654672861099 - ], - "('1', '4096', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.19813136756420135 - ], - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008454970084130764 - ], - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014529259875416756 - ], - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.026538236066699028 - ], - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.18360291421413422 - ], - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.5871036052703857 - ], - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 2.0788326263427734 - ], - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 7.741743564605713 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index 
87360ce3e..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 859.6228244304657, - "evaluated_configs": 5400, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "is_prefill" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": "BLOCK_M: 32, TILE_SIZE: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 9, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": "BLOCK_M: 32, TILE_SIZE: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 9, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": [ - 0.007184021640568972 - ], - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": [ - 0.006555985659360886 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index 5b55f921d..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index 5b55f921d..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json deleted file mode 100755 index 47793d9a0..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 828.1587612628937, - "evaluated_configs": 540, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - 
"HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "is_prefill" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": "BLOCK_M: 16, TILE_SIZE: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": "BLOCK_M: 16, TILE_SIZE: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": [ - 0.0039040199480950832 - ], - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": [ - 0.0035902990493923426 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index a83cef97e..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 363.07500290870667, - "evaluated_configs": 540, - "keys": [ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3" - ], - "cache": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 256, BLOCK_M: 512, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 4.2064047534040583e-07 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-96fc3b4e585fc8cfcb4fcdd974640839b5a5889cf4f54dbf57ad6a3439b671d0/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-96fc3b4e585fc8cfcb4fcdd974640839b5a5889cf4f54dbf57ad6a3439b671d0/default/cache.json deleted file mode 100755 index 6f91d97c5..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-96fc3b4e585fc8cfcb4fcdd974640839b5a5889cf4f54dbf57ad6a3439b671d0/default/cache.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention_tuned:kernel_unified_attention_2d)", - "total_bench_time_s": 364.13932609558105, - "evaluated_configs": 540, - "keys": [ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3" - ], - "cache": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 3, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005123822949826717 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index 5b55f921d..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f52792779faa0af779cada63f2df14c185a5b34f253646e36c07bb8926f93dc8/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-88d41f86261407aa0eaf355d2d650ddaee68bdf62e28c6cc74f4e1bcacddcfd8/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f52792779faa0af779cada63f2df14c185a5b34f253646e36c07bb8926f93dc8/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-88d41f86261407aa0eaf355d2d650ddaee68bdf62e28c6cc74f4e1bcacddcfd8/default/cache.json deleted file mode 100755 index a4569e066..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f52792779faa0af779cada63f2df14c185a5b34f253646e36c07bb8926f93dc8/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-88d41f86261407aa0eaf355d2d650ddaee68bdf62e28c6cc74f4e1bcacddcfd8/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention_tuned:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json deleted file mode 100755 index 368c26881..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 1181.724599123001, - "evaluated_configs": 540, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": [ - 0.0031476265285164118 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index 6b8ebea6e..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 720.5651552677155, - "evaluated_configs": 5400, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": "BLOCK_M: 32, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - 
}, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": [ - 0.003578872187063098 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json deleted file mode 100755 index 12932629d..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 368.8641257286072, - "evaluated_configs": 2160, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": [ - 0.003861392615363002 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index 02018ed3d..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 362.4042990207672, - "evaluated_configs": 5400, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 9, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": [ - 0.0031293570064008236 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index d3eb13852..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": 
"JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index d3eb13852..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json deleted file mode 100755 index 81ab50506..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "signature": 
"JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 360.36944031715393, - "evaluated_configs": 540, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": [ - 0.0035186302848160267 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index d3eb13852..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3a6fc1c46225b2f7d0bc848adf5344e3dda28dcbb0957584ee22138ce6625218/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3a6fc1c46225b2f7d0bc848adf5344e3dda28dcbb0957584ee22138ce6625218/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json deleted file mode 100755 index d53f63026..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3a6fc1c46225b2f7d0bc848adf5344e3dda28dcbb0957584ee22138ce6625218/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 367.19957637786865, - "evaluated_configs": 900, - "keys": [ - "num_query_heads", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '128', '128', '16')": "TILE_SIZE: 32, num_warps: 8, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '128', '128', '16')": [ - 0.0031237052753567696 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3ec72a24614e22e4f8984d4b3b95b35928fcaf36a5101e03f51287f47aa54959/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3ec72a24614e22e4f8984d4b3b95b35928fcaf36a5101e03f51287f47aa54959/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json deleted file mode 100755 index e30476d4b..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3ec72a24614e22e4f8984d4b3b95b35928fcaf36a5101e03f51287f47aa54959/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 360.16377663612366, - "evaluated_configs": 900, - "keys": [ - "num_query_heads", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '128', '128', '16')": "TILE_SIZE: 32, num_warps: 8, 
num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '128', '128', '16')": [ - 0.0031249839812517166 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-52c92ceef6d420c78c5c5940c8b38fe551467bdabe0ca1810415fbe039359610/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-52c92ceef6d420c78c5c5940c8b38fe551467bdabe0ca1810415fbe039359610/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json deleted file mode 100755 index acb692e9e..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-52c92ceef6d420c78c5c5940c8b38fe551467bdabe0ca1810415fbe039359610/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 102.50655031204224, - "evaluated_configs": 90, - "keys": [ - "num_query_heads", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '128', '128', '16')": "TILE_SIZE: 16, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '128', '128', '16')": [ - 0.0022160690277814865 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-5c087adac96d09b2060f573486a99205cda08f58e544b9acfd14918832e2e582/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-5c087adac96d09b2060f573486a99205cda08f58e544b9acfd14918832e2e582/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json deleted file mode 100755 index e7d868df2..000000000 
--- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-5c087adac96d09b2060f573486a99205cda08f58e544b9acfd14918832e2e582/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json deleted file mode 100755 index 4230de538..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 77.74497675895691, - "evaluated_configs": 90, - "keys": [ - "num_query_heads", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '128', '128', '16')": "TILE_SIZE: 32, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '128', '128', '16')": [ - 0.002219553105533123 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-72dc6d55a572ac899f3da4b41257cc6aeb8cad69a0fc94b16aa73ca9c82b4012/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-72dc6d55a572ac899f3da4b41257cc6aeb8cad69a0fc94b16aa73ca9c82b4012/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json deleted file mode 100755 index e7d868df2..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-72dc6d55a572ac899f3da4b41257cc6aeb8cad69a0fc94b16aa73ca9c82b4012/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json deleted file mode 100755 index e7d868df2..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-f130aa2e7a5258b0e95f6494e2db37f5dea3ccbb97ee8feed09d2d36599bff88/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-071e784de56797ed9764ebe722a0ebf6c8c9719610c15e34a8b3a8f9fe7252ae/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-f130aa2e7a5258b0e95f6494e2db37f5dea3ccbb97ee8feed09d2d36599bff88/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-071e784de56797ed9764ebe722a0ebf6c8c9719610c15e34a8b3a8f9fe7252ae/default/cache.json deleted file mode 100755 index e7d868df2..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-f130aa2e7a5258b0e95f6494e2db37f5dea3ccbb97ee8feed09d2d36599bff88/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-071e784de56797ed9764ebe722a0ebf6c8c9719610c15e34a8b3a8f9fe7252ae/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/triton-dejavu b/triton-dejavu index 5c7d4fa99..8f06d4903 160000 --- a/triton-dejavu +++ b/triton-dejavu @@ -1 +1 @@ -Subproject commit 5c7d4fa9915134d1ce12c4e244015ee705cd5df3 +Subproject commit 8f06d4903056e30867620576b251489c3e9baa8c diff --git a/tune_log_g4small.txt b/tune_log_g4small.txt deleted file mode 100644 index e87bf0ece..000000000 --- a/tune_log_g4small.txt +++ /dev/null @@ -1,14544 +0,0 @@ -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.21952056884766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.08415967226028, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 95.1449602842331, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.96367955207825, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.01135909557343, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.8116797208786, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.98880136013031, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.25296032428741, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 120.80431878566742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.81823897361755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.77791965007782, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.6235209107399, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.30752062797545, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.14223992824554, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.07647919654846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.75008189678192, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 120.48751831054688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.90320003032684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.32399988174438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.56895941495895, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.95551943778992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.86191987991333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.41519868373871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.11391997337341, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 120.62847971916199, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.37600016593933, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.97728019952774, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.66848009824753, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.06767928600313, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.1966392993927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.42016017436981, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.24863958358765, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 106.62255942821503, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.56479978561401, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.26896071434021, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.28272032737732, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.04415988922119, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.0792008638382, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.50959920883179, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.26288032531738, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.2688010931015, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.45119988918304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.7017593383789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.7870409488678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.5108813047409, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.76943969726562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.97296071052551, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.18736004829407, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.30688011646271, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.32127964496613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.01535964012146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.80000078678131, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.95936036109924, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.87296104431152, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.53248119354248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.3331196308136, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.66159987449646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.10959911346436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.59728074073792, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.66848075389862, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.40544068813324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.71199989318848, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.05631959438324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.78655993938446, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.29023921489716, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.43103921413422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 132.36160099506378, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 135.17599940299988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.5961594581604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.2511990070343, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.91151797771454, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.92255985736847, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.28623974323273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.46527969837189, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 132.73632049560547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 135.31391978263855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.36176180839539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.91471946239471, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.6344004869461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.98496055603027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.47920024394989, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.38431930541992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 132.95120060443878, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 135.2299201488495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.37504053115845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.74495947360992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.05023980140686, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 140.11984050273895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.35696125030518, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.0168000459671, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 132.5961595773697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 135.19232034683228, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.04480040073395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.37583923339844, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.5473598241806, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.8715192079544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.84543907642365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.47103989124298, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.00479996204376, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.70496046543121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.76848137378693, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.4023984670639, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.50992047786713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.8615991473198, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.36511981487274, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.80255913734436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.90768045186996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.87664031982422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.71503937244415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, 
"num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.07711946964264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.30224001407623, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.21104001998901, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.1295998096466, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.3929613828659, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.92687910795212, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.64911925792694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.03391945362091, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.43920040130615, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.37056005001068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.80703997612, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.48832023143768, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.26639986038208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.1515206694603, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.96400046348572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.19855999946594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.8088002204895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.56783890724182, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.04335874319077, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.51872050762177, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.60496008396149, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.72272074222565, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.21567976474762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.23936033248901, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.17279994487762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
106.64064049720764, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.37472021579742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.7777590751648, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.35952007770538, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.8687995672226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.75968039035797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.43968081474304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.30911993980408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.02687954902649, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.53423988819122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.84975969791412, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.03359889984131, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.55631983280182, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.79584074020386, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.93856072425842, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.30256044864655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.96799790859222, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.97087967395782, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.3231999874115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.31471943855286, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.96576023101807, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.367520570755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.65295994281769, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.47903847694397, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.14735960960388, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.33376026153564, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 
5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.4876799583435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.05407989025116, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.07632029056549, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.60560071468353, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.70032131671906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.22111988067627, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.88960099220276, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.00464046001434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.16095888614655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.5769602060318, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.2671993970871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.15855932235718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.54255974292755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.0723204612732, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.99807965755463, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.6131204366684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.03247916698456, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.36495912075043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.80544030666351, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.19104027748108, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 131.7660790681839, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.06976091861725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.76255965232849, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.42143988609314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.3687995672226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
106.9324803352356, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.37872052192688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.53839981555939, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 132.99808025360107, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.91008031368256, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.22000086307526, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.40383970737457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.48848032951355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.6155207157135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.22688007354736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.99743902683258, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 133.76015901565552, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.34768152236938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.54607999324799, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.20000076293945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.48479974269867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.52512013912201, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.23184096813202, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.87504029273987, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 133.6572802066803, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.28063881397247, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.78575921058655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.30751979351044, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.29695904254913, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.46479988098145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.60704064369202, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.41152131557465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.95151948928833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.23456013202667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.39776062965393, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.10367977619171, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.68639945983887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.25311958789825, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.8051209449768, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.3476802110672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.16704058647156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.41008043289185, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.75903964042664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.16175937652588, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.17488014698029, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.29679989814758, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.19839978218079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.41855978965759, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.36767852306366, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.24863910675049, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.33199977874756, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.05583798885345, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.09647822380066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.77983915805817, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.7124798297882, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.70815801620483, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.71151959896088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.72240233421326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.77952075004578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.56304037570953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.02832114696503, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 120.07695853710175, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.40479922294617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.9411200284958, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.3185601234436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.49311912059784, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.10111904144287, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 120.27872025966644, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.73104083538055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} 
-{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.63423943519592, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.74831986427307, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.85120010375977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.6529585123062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.87776100635529, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.250559091568, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.06048035621643, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.62160098552704, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.8033584356308, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.68688011169434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 120.04672050476074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.29199922084808, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.9167994260788, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.55887842178345, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.2655987739563, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.06879901885986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.35344219207764, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.46080029010773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.7460800409317, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.9990395307541, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.17199909687042, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.28848087787628, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.12816047668457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.47775924205781, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.46000039577484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
103.22399914264679, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.9752002954483, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.80512034893034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.72831928730011, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.7443196773529, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.141921043396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.5223995447159, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.50720012187958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.36864054203033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.70255970954895, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 128.24624001979828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.74352025985718, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.95888149738312, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 104.94751930236816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.72047889232635, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.25104022026062, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.62576007843018, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.32144069671631, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.92720019817352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.7012814283371, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.63279938697815, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.06560027599335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.5304002761841, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.85968053340912, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.41184139251709, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.9236798286438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 128.36591958999634, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.86223900318146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.77183973789215, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.62944054603577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.11712086200714, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.81632018089294, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.0520007610321, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.4864000082016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.69871914386749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.40607988834381, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.2665604352951, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.5385603904724, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.19055914878845, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.74703991413116, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.71455979347229, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.6947191953659, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.65887999534607, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.05568087100983, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.01648020744324, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.14928007125856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.02896082401276, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.32528102397919, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.6811203956604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.23071992397308, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.77631902694702, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.06111943721771, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.93888032436371, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.60496175289154, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.0680000782013, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.84319949150085, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.4415991306305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.13472127914429, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.74336063861847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.73904037475586, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.28559994697571, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.27183973789217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.1975998878479, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.65455877780914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 149.7278380393982, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.15903985500336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.024799823761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.91360116004944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.57583999633789, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.71583950519562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 149.4928002357483, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.28992092609406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.74128079414368, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.632958650589, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.46528112888336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.60336017608643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 150.0075203180313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.48351860046387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.64655888080597, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.55056059360504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.67999970912933, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.87264001369476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 150.05552113056183, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.58527982234955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.86480045318604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.49600088596344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.97728097438812, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.11856126785278, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.73776030540466, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.58143985271454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 133.38784039020538, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.42559957504272, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.33104085922241, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.37104201316833, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.50160014629364, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.28943920135498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.50591969490051, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.0841612815857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 134.6785604953766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.56159949302673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.84159994125366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.31999981403351, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.83199942111969, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.42111897468567, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.46144092082977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.34768056869507, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 134.23024117946625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.45343911647797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.84159934520721, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.46431994438171, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.75375938415529, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.53247892856598, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.52127921581268, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.80144000053406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 134.35039937496185, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 110.07647931575775, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.87664031982422, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.11376023292542, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.85967814922333, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.84575998783112, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.45504021644592, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.59488093852997, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.74128103256226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.36368036270142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.89103972911835, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.68336057662964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.91631984710693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.44480013847351, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.45135998725891, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.40288043022156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.58799934387207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.28895998001099, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.3340802192688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.92304134368896, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.30864036083221, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.1041601896286, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.21472001075745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.94800066947937, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.44304025173187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.37984156608582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.39584136009216, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.21984088420868, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.46896076202393, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.42127895355225, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.31056010723114, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.45711827278137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.57376039028168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.37167930603027, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.409921169281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.98415994644165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 133.2455998659134, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.7712004184723, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 128.44863891601562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.43071913719177, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 133.6139190196991, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.42127883434296, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.65392065048218, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.86176061630249, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 133.6020803451538, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.33040022850037, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.65999972820283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.97120010852814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 132.95343935489655, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.95071983337402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 128.15903961658478, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.68064057826996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 138.98351907730103, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.36175954341888, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.51632022857666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.44208121299744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 144.69775915145874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.07408046722412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.52512013912201, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.69024002552032, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.5959997177124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.2868800163269, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.76304030418396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.39680075645447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 147.35983788967133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.19343960285187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.71935939788818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.7367992401123, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.60032165050507, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.35871911048889, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.34912085533142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.36272060871124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.61008059978485, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.5107192993164, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.61135995388031, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.15823984146118, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.62480127811432, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.68415987491608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.47103977203369, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.61184000968933, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 147.23424017429352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.47151911258698, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.7670407295227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.01759910583496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.87151968479156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 117.00736105442047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.4523184299469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.98016059398651, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.85216009616852, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.68816030025482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.799840092659, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.98751997947693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.63568019866943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.80623948574066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.47712087631226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.29728031158447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.5996813774109, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.76976084709167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.77055990695953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.9096006155014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.09008026123047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.53312063217163, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.26447820663452, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.49983859062195, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.35967993736267, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.06191968917847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.03199982643127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.27120053768158, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.6116794347763, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.5921607017517, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.65600085258484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.43024051189423, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.62656140327454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.24607956409453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.0454398393631, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.67343997955322, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.01455950737, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.62752044200897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.32335925102234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 128.9548796415329, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.6206395626068, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.89376056194305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.85776019096375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.4478394985199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.97279894351959, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 128.38175892829895, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.45024061203003, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.90512001514435, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.17328011989594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.41440045833588, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.26928174495697, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 128.43840062618256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 225.7868790626526, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.65120136737823, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 225.12335777282715, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.25151884555817, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 226.0313606262207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.28895950317383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 226.62800073623657, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.0836799144745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 129.52304005622864, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 107.67568051815033, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.33920097351074, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.82191979885101, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.00879991054535, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.1534389257431, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.25584101676941, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.10847973823547, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 129.828320145607, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.13487887382507, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.21135914325714, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.77072060108185, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.1289598941803, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.72975957393646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.0329601764679, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.58287966251372, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 129.964799284935, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.12879979610443, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.29088127613068, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.74480032920837, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.1448016166687, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.63503885269165, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.68143939971924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.30496048927306, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 129.66271877288818, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.46016025543213, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.95600152015686, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} 
-{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.1267204284668, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.16656005382538, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.56544029712677, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.83792006969452, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.81999886035919, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.80527949333191, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 129.81472074985504, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.24527931213377, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.70640063285828, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.6512006521225, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.0147204399109, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.72767961025238, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.4935985803604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.47487878799437, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.1350394487381, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.74000024795532, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.30271935462952, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.80384063720703, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.26336252689362, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.5336000919342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.71200048923492, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.774240732193, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.1995198726654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.43903994560242, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 129.79679882526398, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.79695963859558, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.90367949008942, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.72384083271027, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.81775867938995, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.1662393808365, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.96143901348114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.18992066383362, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.4731193780899, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.2235198020935, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.8363196849823, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.0247997045517, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.85967993736267, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.47679746150969, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 108.01503896713257, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.11951959133148, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.44816052913666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 135.84495902061462, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.42847967147827, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.0433599948883, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.43791949748993, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.99712014198303, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.29888021945953, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.53103935718536, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.45040047168732, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 135.70480108261108, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.9902400970459, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.6401596069336, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.67295944690704, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.02431833744049, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.87152111530304, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.74975979328156, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.56224083900452, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.25664114952087, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.51568126678467, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.98176038265228, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.36960124969482, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.4332801103592, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.68464028835297, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.0854400396347, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.6854418516159, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.23440027236938, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.72208142280579, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.92464065551758, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.55232048034668, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.52096033096313, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.72879981994629, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.4087997674942, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.4590392112732, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.65296113491058, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.88736009597778, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.3200011253357, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.17872095108032, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.52735924720764, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.9382404088974, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.43007957935333, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.02559900283813, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.87903976440428, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.18287992477417, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.04304099082947, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.41071951389313, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.43984043598175, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.95152008533478, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.26992058753967, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 140.4124802350998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 112.09360003471375, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.29375958442688, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.38719940185547, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.85760045051575, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.08112049102783, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 140.84991931915283, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.27647960186005, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.60351896286011, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.8652799129486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.21391940116882, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.69903922080994, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 140.53727984428406, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.84816062450409, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.4835205078125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.46832013130188, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.39151978492737, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.90143871307373, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 140.96384048461914, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.27280080318451, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.50783979892731, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.46175932884216, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 142.08927989006042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.96495938301086, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 142.3358392715454, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.09039855003357, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 142.34272003173828, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.99744033813477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 142.0116800069809, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.1139212846756, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.59056103229523, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.37391996383667, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.40623903274536, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.6054402589798, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.82912003993988, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.2590389251709, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.4771190881729, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.97791957855225, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.39360105991364, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 143.63711893558502, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.01119816303253, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.52016067504883, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.27679932117462, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 143.75552117824554, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 147.8719997406006, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.73103952407837, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.143679022789, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 143.49343955516815, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.17631912231445, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.16864049434662, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.76463949680328, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 144.22991931438446, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 146.81103885173798, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.08687925338745, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.0835200548172, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 143.50224137306213, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.12000048160553, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.63855934143066, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.64975929260254, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 143.96927952766418, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.81663870811462, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.7153605222702, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.07488214969635, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 143.67088079452515, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.09632074832916, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.2899204492569, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.77568006515503, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 144.21407878398895, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.98543965816498, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.23311924934387, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.79552125930786, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.00015926361084, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.90256083011627, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.06847989559174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.55519950389862, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.89967954158783, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.53759944438934, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.66671872138977, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.73455953598022, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 117.17920005321503, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 147.48495995998383, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.64719927310944, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.82831990718842, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.00640082359314, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.74896001815796, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 117.17712044715881, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 147.51216053962708, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.34527909755707, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.10383975505829, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.12752103805542, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.36672019958496, 
"config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.50336027145386, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.86367869377136, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.23535978794098, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.32480013370514, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.21215963363647, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.35007977485657, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.5939199924469, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 147.432159781456, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.19151997566223, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.4444808959961, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.4072003364563, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 142.2188800573349, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 156.96032106876373, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.51071846485138, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.96464002132416, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.30400002002716, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.24848008155823, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.32464051246643, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.78847980499268, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.49935948848724, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.16751968860626, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.9262398481369, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.87023985385895, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.42592060565948, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.36464059352875, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.4867205619812, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.83535861968994, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.40655982494354, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.41471982002258, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.31855976581573, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.9025604724884, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.37103962898254, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.44303977489471, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.0249594449997, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.8339204788208, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 215.62079906463623, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 957.859525680542, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 215.8238399028778, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 
256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 957.7103996276855, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 216.74816131591797, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.9918441772461, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 216.19743824005127, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 310.342880487442, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 220.659362077713, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 311.50832176208496, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 224.53823924064636, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 308.7073600292206, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 220.53872108459473, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 311.4625608921051, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 224.4108808040619, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
309.74640011787415, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 219.55552220344543, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 310.98495960235596, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 223.21711897850037, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 308.4230399131775, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 219.64751839637756, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 311.1513590812683, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 224.38512086868286, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 313.44271898269653, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 201.63088023662567, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 312.7019190788269, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 201.55679941177368, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 312.4072003364563, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 200.57920217514038, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 312.54207849502563, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 200.35167932510376, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1742.3779296875, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.2067174911499, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1744.2910289764404, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.79375982284546, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1742.711524963379, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.9486424922943, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1744.2323207855225, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.87616205215454, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.01952052116394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.62031960487366, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 
3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.48032069206238, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.61920046806335, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 292.09808349609375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 280.1147210597992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 284.5358383655548, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.73935866355896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.8942415714264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.5022382736206, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.53183913230896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.71408128738403, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.57855796813965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 281.1942434310913, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 276.89647793769836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.0896019935608, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.91872119903564, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.92143750190735, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.91775965690613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.46143865585327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 293.72976183891296, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 293.0740785598755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 281.610723733902, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.45647621154785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.82255816459656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.9921579360962, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.35407876968384, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.0457639694214, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.85599875450134, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 285.91423869132996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.71984219551086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 266.5283179283142, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.64704275131226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.27807807922363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.508159160614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.9947168827057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.11983919143677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.1719994544983, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.00688219070432, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.21440148353577, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 248.6347198486328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.54880023002625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.0759983062744, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.36015939712524, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.635196685791, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.87407779693604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.4654381275177, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.40352058410645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.22224044799805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.44975972175598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.95392274856567, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.35312247276306, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 266.2020790576935, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.5236814022064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.7295968532562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.6262412071228, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.78448009490967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.9145622253418, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.6259183883667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.08495998382568, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.9737591743469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.7124786376953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.457279920578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 274.2793595790863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.83663868904114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.13760018348694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 349.6822392940521, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.5526382923126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 297.20735907554626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 286.7516803741455, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.6828806400299, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.03728008270264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.69167852401733, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 266.7803204059601, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 340.13248085975647, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.70608043670654, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 306.3313591480255, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 283.1761598587036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.5363199710846, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.9785556793213, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 274.56560254096985, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 274.64272141456604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.59135913848877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.4353623390198, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 299.56159949302673, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.5857594013214, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.70063948631287, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.99152159690857, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.5889608860016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.45600056648254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 342.4415969848633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 348.3417594432831, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.79967880249023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 295.44464111328125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.6841607093811, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.87120366096497, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.46335887908936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.45728015899658, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.69248151779175, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.59776306152344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.20623898506165, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.37439966201782, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.82351922988892, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.10031986236572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.88816046714783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.73376178741455, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.47247910499573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.9795217514038, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.8196816444397, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 232.325279712677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.05903935432434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.71856093406677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.4265582561493, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.63824009895325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.48495948314667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.89199948310852, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.21136140823364, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, 
"num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.273921251297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.92447924613953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.62288284301758, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.49440050125122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.66384196281433, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.24335932731628, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.32975792884827, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.1110372543335, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.0479965209961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.40528202056885, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.74479913711548, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.82207894325256, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.98671865463257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.4832007884979, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.5790386199951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.31536173820496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.48175740242004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.7710394859314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.99440026283264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.50207924842834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.41551899909973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.1222424507141, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.58799982070923, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.72640132904053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.18607878684998, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
256.2105596065521, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.2393605709076, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.97424244880676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.96575951576233, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.4475221633911, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.58624362945557, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.51439809799194, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.0265612602234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.7398383617401, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 258.18512082099915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 232.16031908988953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.16160035133362, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.67487788200376, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.6921582221985, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.73792147636414, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.2716784477234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.1812801361084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.38911938667297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.5995206832886, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 307.0958375930786, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.876318693161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.8156816959381, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.9065592288971, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 319.15056228637695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.56784033775332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.78559970855713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} 
-{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.91088247299194, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 303.17375779151917, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.9083209037781, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.3523201942444, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 269.7886383533478, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 313.21776032447815, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.73791790008542, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 281.6812777519226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.67519807815552, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.09056115150452, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.99008011817932, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.23855805397034, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.23551988601685, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.90400099754333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.79983925819397, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.69999885559082, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.42991876602173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.66223859786987, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.32511854171753, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.9667203426361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.7841601371765, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.87407875061035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.08240222930908, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.35088181495667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.58880019187927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
237.51728177070618, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.64719653129578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.23487997055054, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.01328110694885, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.54559922218323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.3521602153778, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.5411217212677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.48640036582947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.79248213768005, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.17456150054932, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.45440101623535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.31743693351746, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.96320271492004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.69440007209778, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.26880073547363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.08368062973022, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.31856060028076, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.72479915618896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.98784041404724, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.70687818527222, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.71871900558472, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.0591995716095, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 258.6526393890381, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.16304254531863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.5201587677002, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.5073606967926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.42799925804138, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.10608005523682, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.4976007938385, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.79184079170227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.6611201763153, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.9363214969635, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.62848114967346, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.9859209060669, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.1092803478241, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 258.82399916648865, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.89615988731384, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.1480007171631, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.896320104599, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.4648003578186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 285.84223985671997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.4737572669983, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 284.8292803764343, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.12367725372314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 285.5462396144867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.26096200942993, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.5062370300293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.85535717010498, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.52207827568054, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 258.00464034080505, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.47824025154114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 242.8764772415161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.53631949424744, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.367680311203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.0223984718323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.33823943138123, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.268159866333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.0889618396759, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.8067181110382, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.81071734428406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.4396777153015, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.3075180053711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.84320092201233, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.187358379364, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.13311767578122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.80271863937375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.39583897590637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.08655858039856, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.8209617137909, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.90256071090698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.5795180797577, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.66607880592346, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.0385603904724, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.0435209274292, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.8940799236298, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.3729588985443, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.29280114173892, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.6579215526581, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.80336093902588, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.41727948188782, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.78176021575928, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.95904159545898, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.37631821632385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 294.4534409046173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.32879877090454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.8732810020447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.06496000289917, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.02096128463745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.1769597530365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.95407819747925, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.50239872932434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.561119556427, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.6796782016754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.4238407611847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.3622419834137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.67791867256165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.03183722496033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.99663829803467, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.4945592880249, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.8449604511261, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 287.86896109580994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 276.31184339523315, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 259.7171187400818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.4345588684082, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.65344047546387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.42144060134885, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.26943850517276, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 294.95248079299927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 280.48160314559937, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.8073606491089, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.88479804992676, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.863840341568, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.07840180397034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.32480001449585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 318.296320438385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 281.1646378040314, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 266.11536145210266, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.92464232444763, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 333.23952078819275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.41232204437256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.2470405101776, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.9382390975952, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 319.0112018585205, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 284.5129609107971, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.53664088249207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.92224311828613, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 338.9891195297241, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.2369611263275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, 
"num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.85183858871463, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 269.7323191165924, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 325.809121131897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 284.0215992927551, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 275.96768021583557, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.17887806892395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 333.4542381763458, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.36319851875305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.27279806137088, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.1724796295166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 318.25583934783936, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.9756796360016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.6875183582306, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 274.04703974723816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 324.6737587451935, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 319.8260819911957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.1152012348175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 419.54336047172546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 307.84255862236023, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.5800006389618, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.23616003990173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 308.19984197616577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 400.4302382469177, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 415.4870367050171, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 315.3063988685608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 370.09984135627747, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.7343990802765, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 315.063841342926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.21135926246643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.1636850833893, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 321.2718403339386, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.9774408340454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.6552016735077, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 306.02863907814026, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 392.4412775039673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.19839882850647, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 300.17647981643677, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.0225579738617, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.6119978427887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.1150405406952, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.2388801574707, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.29615950584412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.89968276023865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.54175877571106, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.54176235198975, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.50863981246948, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.8007991313934, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.05983996391296, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.47103881835938, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.85103750228882, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.82559943199158, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, 
"num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.84928178787231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.43408226966858, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.93296027183533, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.54960083961487, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.7364799976349, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.97456169128418, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.04367876052856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.67360281944275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.10319852828977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.76943969726562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.44080185890198, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.47872114181519, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.02239799499512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 
64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.66575860977173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.80672144889832, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.90288066864014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.3140833377838, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.33375883102417, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.84720134735107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.96367835998535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.1340811252594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.88079833984375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.28271985054016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 276.7140805721283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.33247923851013, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
248.88559937477112, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.92959690093994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.3996813297272, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.0416009426117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.35360145568848, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.64287972450256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 283.1444811820984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.221120595932, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.9395182132721, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.56591749191284, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 276.5841603279114, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.94032192230225, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.5580816268921, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.8782386779785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 276.3051176071167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.3062379360199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.2075209617615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 274.66384172439575, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 279.14639949798584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.3352026939392, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.99887919425964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.5262405872345, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.946560382843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.3987205028534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.03615975379944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 269.5625603199005, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 269.2307209968567, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 299.5020806789398, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.6351993083954, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 334.1220808029175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 311.4311981201172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 306.94656133651733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.89888215065, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 332.91375756263733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 318.68255972862244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 305.2179217338562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.6174385547638, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 324.68144059181213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 307.50240087509155, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 
256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 302.46543765068054, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.1768000125885, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 326.7999994754791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 313.32687854766846, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.9155192375183, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.112318277359, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.65424132347107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.95135807991028, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.82335948944092, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.8308789730072, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.21503901481628, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.98160338401794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.93824172019958, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.4609603881836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.35536074638367, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.9515199661255, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.82656073570254, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.333758354187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.3955225944519, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.16192078590393, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.4726424217224, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.96480178833008, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.79151916503906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.4827220439911, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.57679867744446, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 240.49823999404907, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.5929594039917, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.00640082359314, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.96751880645752, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.54479598999023, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.84208130836487, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.13327860832214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.64624071121216, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.0177628993988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.38079929351807, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.2073621749878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.36735606193545, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.5886380672455, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 
4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.56352066993713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.75376057624817, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.1084794998169, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 266.60223841667175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.6502411365509, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.4468812942505, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.03775882720947, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.4503996372223, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.9881603717804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.61855959892273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.05903887748718, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.181759595871, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.9526391029358, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 
128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 258.75744104385376, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.7089557647705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 307.1668839454651, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.5600032806396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 306.48687958717346, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.3556799888611, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 309.7987174987793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.0430335998535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 304.7865641117096, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.86544203758237, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.17824029922485, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.06832218170166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
258.1599998474121, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.82112193107605, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.24799919128418, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.28224182128906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.64639830589294, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 266.28495693206787, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.66912055015564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.73392367362976, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.87312006950378, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.76671957969666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.53999948501587, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.1921606063843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.95040011405945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.00752067565918, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.16256165504458, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.0209617614746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.87872266769406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.6051208972931, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.7390389442444, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.25311923027039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.53503966331482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.9148817062378, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.4718391895294, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.39551854133606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.84287881851196, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.16207909584045, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.4056005477905, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.1934413909912, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.5265598297119, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.2599997520447, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.70944094657898, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.88944029808047, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 294.3438386917114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 303.8156771659851, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 258.8860809803009, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.76559948921204, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 331.9489586353302, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.9116792678833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.90032243728638, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.0103991031647, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 305.0382399559021, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 305.0651204586029, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 266.00223898887634, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.1060814857483, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 328.5153615474701, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.91376066207886, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.2166352272034, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.3091206550598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 304.85520124435425, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 308.750718832016, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.677122592926, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 281.07760071754456, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 333.1982409954071, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.9015965461731, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.69200110435486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.76496148109436, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 301.6969621181488, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 305.0539195537567, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.70336055755615, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 278.0448019504547, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 328.02639842033386, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 301.6195213794708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 330.54239869117737, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 326.04080080986023, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 366.1075246334076, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.969598531723, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.0534417629242, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 312.14287996292114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 333.5860800743103, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 326.30671858787537, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.1630401611328, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.79167914390564, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.1092805862427, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 308.8918387889862, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 339.65872049331665, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 322.49824047088623, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.6551992893219, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.41216015815735, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.408322095871, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 310.00768065452576, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 341.06191754341125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 326.85360074043274, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.3158423900604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.6566393375397, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.7044794559479, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.16608119010925, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.8992009162903, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.1785578727722, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.07280254364014, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.02735900878906, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, 
"num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.01648330688477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.1176047325134, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.0009641647339, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.0222396850586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.36479878425598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.6985614299774, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.03632164001465, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 275.51679849624634, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.28448057174685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 276.2667179107666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.8003215789795, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.69983983039856, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.87807965278625, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.2582411766052, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.89519906044006, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.10096311569214, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.03392028808594, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.64735555648804, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.8367967605591, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.8977611064911, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.3278419971466, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.69487833976746, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.6022391319275, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 279.27183985710144, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.77632308006287, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 268.29984068870544, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.9742386341095, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.2351987361908, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.29807662963867, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.14720249176025, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.1451184749603, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 283.33136081695557, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.3596796989441, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.55439949035645, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 275.05647897720337, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 317.56208062171936, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 285.6779193878174, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 349.53311800956726, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 283.9254403114319, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 328.9796793460846, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.03280091285706, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.37903928756714, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 285.36831855773926, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 315.91487884521484, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 295.1248002052307, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.2944006919861, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.6284821033478, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 320.6968021392822, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.4179184436798, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.3081605434418, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.4956798553467, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.79231977462769, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.16111850738525, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.3575987815857, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.0916805267334, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.85792088508606, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.7716805934906, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.6688003540039, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.910560131073, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.87888169288635, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.35568070411685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.7852802276611, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.56767964363098, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.1105630397797, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.27679896354678, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.36527681350708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.58559846878052, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.32384085655212, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.81376194953918, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.10688066482544, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.9110412597656, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.42976093292236, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.25695967674255, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.9756810665131, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.64143919944763, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 297.15904116630554, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 288.2817602157593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 295.2950417995453, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.0811195373535, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 295.78351974487305, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 291.86432123184204, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.31808042526245, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 294.034880399704, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.79551696777344, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.22127890586853, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.48528027534485, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.64159870147705, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.0388813018799, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.42656111717224, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.20639967918396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.4167950153351, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 461.5758419036865, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.62799644470215, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.1078460216522, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.31056451797485, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.94576358795166, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 411.25648260116577, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.98624205589294, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 396.9478392601013, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.14416551589966, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 428.6241555213928, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 381.84320092201233, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 
4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.6214380264282, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.0388813018799, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 428.0851221084595, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.11967873573303, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.59983706474304, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.97088146209717, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.95503854751587, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.623681306839, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.19296288490295, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.2398376464844, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 423.88240218162537, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 383.4703993797302, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.3279995918274, "config": {"BLOCK_SIZE_M": 128, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.9094362258911, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 431.24783754348755, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.7156789302826, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.02800011634827, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.70239877700806, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.4281632900238, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.055522441864, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.9203209877014, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.6140785217285, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.5257587432861, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.0452761650085, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.5503988265991, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 541.0516810417175, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.5406398773193, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.0111994743347, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6547193527222, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 343.8593578338623, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 306.2324810028076, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 286.6699206829071, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 335.2665615081787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.46751976013184, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 278.62751960754395, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 344.0990400314331, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 319.4875192642212, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.9278407096863, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 340.2192008495331, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.33520126342773, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.98943758010864, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 344.2724812030792, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 314.9502408504486, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 294.36911940574646, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 336.9374406337738, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 315.5336010456085, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 286.7252838611603, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.9003186225891, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 312.473760843277, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.367520570755, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 334.7265613079071, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 299.4095981121063, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 274.95967984199524, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 414.65823888778687, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.8539206981659, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.7694363594055, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.9126410484314, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 414.7548806667328, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.22975873947144, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 427.466082572937, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.46880197525024, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.79456090927124, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 298.4494411945343, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.0571210384369, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.117280960083, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.3257601261139, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 300.56959986686707, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.30672001838684, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.8596785068512, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 384.7779190540314, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 300.81615924835205, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.26879811286926, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.8436770439148, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 383.302081823349, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 297.9479992389679, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.8054401874542, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 272.52944111824036, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1227.089433670044, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 292.55151867866516, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1205.5846405029297, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 291.9147193431854, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.11039352417, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.8095977306366, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1217.0462322235107, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 286.5070390701294, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 888.1289672851562, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.9308776855469, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 886.5963172912598, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.52991771698, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 904.5785570144653, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.4401574134827, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 913.9846467971802, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.3406386375427, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 882.2273540496826, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.5999975204468, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 893.1643295288086, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.8686375617981, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 882.8012704849243, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.7043180465698, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 911.9380807876587, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.6737604141235, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 922.434720993042, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.7742404937744, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 966.8311977386475, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.282398223877, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 902.4003148078918, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.1715264320374, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 911.4887952804565, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.7982449531555, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3696.5366554260254, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.8011236190796, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3732.1895599365234, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.22015714645386, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3699.6262168884277, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.662082195282, "config": 
{"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3706.461296081543, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.19007635116577, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.9563217163086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.55840396881104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.3790431022644, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.49184131622314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.8371181488037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.13024044036865, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.67936658859253, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.29919958114624, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.3075189590454, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.582558631897, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 437.28384256362915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.3950409889221, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.87712049484253, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.2438397407532, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.7696056365967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.75872230529785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.2409596443176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.92655992507935, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 434.6607995033264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.481764793396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.4785614013672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.8153614997864, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.21663761138916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.93391704559326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.1708827018738, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.7081604003906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.4312014579773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 433.1222414970398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.8798394203186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.2542381286621, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.88272428512573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.95919704437256, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.57552576065063, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 436.070077419281, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.23888540267944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.29759550094604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.48527574539185, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.8566403388977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.7209553718567, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.8857612609863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.2379183769226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.0705609321594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.711040019989, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 461.6214370727539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.99391746520996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.7171139717102, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.97216176986694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.5228786468506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.865918636322, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.0990409851074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.3924775123596, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.5243248939514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.476318359375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.3304009437561, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.0241599082947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.72800636291504, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.6454372406006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.13855934143066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.2596802711487, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.92607593536377, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.67551851272583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 452.6871991157532, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.56575775146484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.3785548210144, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.9990358352661, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.39296531677246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.066400051117, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.92351770401, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.5449628829956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8848032951355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.401282787323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.1929602622986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.7252793312073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.19904041290283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.9563179016113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.3228807449341, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.9796743392944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.3915224075317, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.7502398490906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.7787222862244, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.8779225349426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.3395233154297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.1052803993225, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.9151916503906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8948850631714, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.8580799102783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.8652801513672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.5963191986084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.16176652908325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.57024002075195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.5827188491821, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.4780769348145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.6632022857666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.9329562187195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.5542378425598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.6558456420898, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.94880199432373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.8142399787903, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.0744061470032, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 433.7159991264343, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.7987289428711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.212797164917, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 433.98303508758545, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.1723213195801, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.4704008102417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.0047974586487, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.35472202301025, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.5628786087036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.4870457649231, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.09455823898315, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 436.3423991203308, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 436.0419225692749, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 450.9107208251953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.50063848495483, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 431.0747194290161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.21760272979736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.9652853012085, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.507200717926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.983521938324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.9140787124634, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.6187205314636, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.0563235282898, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.42288064956665, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 436.0275197029114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.83583784103394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.53711891174316, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.161283493042, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.2231955528259, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.5696039199829, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.108962059021, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.71615505218506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.38416624069214, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.7817621231079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 433.5774350166321, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.7136034965515, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.8428831100464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.72543573379517, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.66991996765137, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.3833613395691, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.08656215667725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.93136405944824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.4388747215271, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.25920009613037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.52256441116333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.084321975708, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.54383850097656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.1487979888916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.73967456817627, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.51216411590576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.5579180717468, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
461.67664527893066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.00431728363037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.1380772590637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.94896268844604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.6292796134949, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.11967945098877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.18064069747925, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.40784072875977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.4022407531738, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.1137537956238, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.3993601799011, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.7697620391846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.1428818702698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.1088018417358, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.4974431991577, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.7798466682434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.57647609710693, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.9835209846497, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.5660786628723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.7955160140991, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.7660789489746, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.9689598083496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.82480096817017, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.4470367431641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.72655391693115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3692812919617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 
64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.60896015167236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.3014392852783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.3415951728821, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.30239725112915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.74992275238037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.39855670928955, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.07999420166016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.2233581542969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.44000339508057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.57216024398804, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.8572793006897, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.29952239990234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.784321308136, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.8839955329895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.2393593788147, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.8023953437805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.40768480300903, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.9350414276123, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.70128059387207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.34560108184814, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.57952308654785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.24032258987427, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.06319522857666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.0708808898926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.7772798538208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.1243152618408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.14335918426514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.3484802246094, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.5487937927246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.39183950424194, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.69983768463135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.4884886741638, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.1195139884949, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.1302423477173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.23327255249023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.9713616371155, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.87023878097534, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.13808393478394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, 
"num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.71343994140625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.7079977989197, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.4326367378235, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.6467170715332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.18144035339355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.6388816833496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.1617622375488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.2748794555664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.4883222579956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.4385576248169, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.05888080596924, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.13343477249146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.4577589035034, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.0025644302368, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.91552114486694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.81663751602173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.81791830062866, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.4726357460022, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 767.197916507721, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.082079410553, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 786.714243888855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7886385917664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.852481842041, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.534722328186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.7598419189453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.1523246765137, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.7343945503235, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.99600315093994, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.84351873397827, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.9803171157837, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 434.1148781776428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.311044216156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.8899164199829, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.68144130706787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.30592012405396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.47135972976685, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.0942358970642, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.5675196647644, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.08352184295654, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.49744272232056, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.17920207977295, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.28320121765137, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.6542434692383, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.9796848297119, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.86656188964844, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.37951707839966, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.70063638687134, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.04495573043823, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.5516800880432, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.44208240509033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.1054368019104, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.99567890167236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.7369589805603, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.05903673171997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.82495975494385, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.3484797477722, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.03663635253906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.28352975845337, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.36192178726196, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.17983865737915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.0140805244446, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.83839750289917, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.83168029785156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 451.1084842681885, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.49359607696533, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.62848377227783, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.5153613090515, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.2751979827881, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.63120794296265, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.09839391708374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.91007947921753, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.05375576019287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.8743968009949, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.2927956581116, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.1227169036865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.6068768501282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.54928064346313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.59391927719116, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.27872133255005, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.84575605392456, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.6030378341675, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.4520010948181, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.05903911590576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.8240032196045, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.7820816040039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.8951997756958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9270401000977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.4100775718689, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.1683259010315, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.6390390396118, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.0980763435364, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.1552023887634, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.9057626724243, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.018883228302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.23583698272705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.84751892089844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.3276786804199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.6649580001831, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.94271993637085, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.0824017524719, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.14528036117554, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.624963760376, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.932963848114, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.85599422454834, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.07487392425537, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.0648012161255, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.6894392967224, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.2630443572998, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.29567766189575, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.5257577896118, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.36144256591797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.9403233528137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.4174361228943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.61487865448, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 460.59791803359985, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.4643168449402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.6046404838562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.8256068229675, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.37983560562134, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.76239585876465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.8902382850647, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.6880025863647, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.0209550857544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 727.4769568443298, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.9939169883728, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.1294412612915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.8537578582764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.065755367279, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.6790347099304, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.3268814086914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.7532811164856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.6017642021179, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.5286350250244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.3379249572754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.78639793396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 737.7025604248047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 760.4059147834778, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.4943985939026, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.1902389526367, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 694.8574376106262, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.1652812957764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.5695986747742, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.905595779419, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.0817589759827, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.8324847221375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 689.0115189552307, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.53136444091797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.0795192718506, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.57920026779175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.50063610076904, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.73280477523804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.6331238746643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.6124768257141, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.79504013061523, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.2592034339905, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.07007360458374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.1156802177429, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.1540832519531, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.4987201690674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.5198383331299, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.9540796279907, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.0764765739441, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.64927768707275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.32623958587646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.90144443511963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 440.75680017471313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.53199768066406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.6596760749817, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.64463901519775, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.5447964668274, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.0244812965393, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.6344027519226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.3812837600708, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.2035217285156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.9411211013794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.7512001991272, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.03040409088135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.6561574935913, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.0724835395813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.797119140625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.79487705230713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.45487642288214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.6020760536194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.81135845184326, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.89231395721436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.06767749786377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.3732786178589, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.8678421974182, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.5870451927185, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.3310375213623, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.16400051116943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.76976346969604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.3358373641968, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.9075255393982, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.6555185317993, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.9603171348572, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.142080783844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.79616403579706, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.35599279403687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.23088359832764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.0662384033203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.3123264312744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.25360012054443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
446.5620756149292, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.20159816741943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.69071865081787, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 461.54159784317017, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.4303970336914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.38575553894043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.99744176864624, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.5207953453064, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 680.3891205787659, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.4177598953247, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.1387257575989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.3819231987, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.8014478683472, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.7635231018066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.9121556282043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.970883846283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 683.9639925956726, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.5492835044861, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.8779239654541, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.4555177688599, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.0505647659302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.4040040969849, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.1529560089111, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.3172812461853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.41279888153076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.99872398376465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.2251172065735, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.52207708358765, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.9119944572449, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.9054388999939, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.94224309921265, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.6446418762207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.9697642326355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.2275228500366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.8505549430847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.96239948272705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.3331198692322, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.59007930755615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.3022389411926, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.8209595680237, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.61152029037476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.81071853637695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.35375785827637, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.0558409690857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.8596806526184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.2358388900757, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.249764919281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.3332781791687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.341121673584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.9678440093994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.4801607131958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 466.36239767074585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.50975799560547, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.04368591308594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.18607902526855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.6376004219055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.5953550338745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.16863918304443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.0497603416443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.24015522003174, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.57952308654785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.3806447982788, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.2219228744507, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.00928258895874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} 
-{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.09008026123047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.334077835083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.5054392814636, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 461.3097596168518, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.6617579460144, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.7420792579651, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.54159450531006, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 918.4292793273926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.1011204719543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 961.8967962265015, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.6216015815735, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 915.5791997909546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.8563194274902, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 949.522876739502, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.990403175354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.8790340423584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.780478477478, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.468638420105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.9583992958069, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.23247957229614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.21184253692627, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.02352380752563, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.8542437553406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.990882396698, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.1280016899109, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 448.9084792137146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.7408013343811, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.29120206832886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.7112030982971, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.6505618095398, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.3475275039673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.54944133758545, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.9099168777466, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.804160118103, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.77951860427856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.10079860687256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.08528327941895, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.9727964401245, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.8804793357849, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.1912021636963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.6355199813843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.6153607368469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.38479709625244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.7116723060608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.3585591316223, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.933916091919, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.7038416862488, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.1604833602905, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.399516582489, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.7622413635254, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.7667169570923, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.792311668396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.26416206359863, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.0612840652465, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.1763205528259, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.38415813446045, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.4963173866272, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.256959438324, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.0512022972107, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.8968005180359, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.2931170463562, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.3188824653625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.2918357849121, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.6201639175415, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.13759899139404, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.5639977455139, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.2959895133972, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.8947200775146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.170560836792, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2878403663635, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 605.8300733566284, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.7235188484192, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.28463888168335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.5547194480896, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.3111972808838, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.795202255249, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 486.3476800918579, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.31280183792114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.0838356018066, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.5984034538269, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.5651206970215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.0366430282593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.6552014350891, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.0913577079773, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.4553623199463, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.8739199638367, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.5588803291321, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.2456030845642, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.9768013954163, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.5862407684326, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.1004781723022, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.2425637245178, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.4713606834412, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.7796792984009, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.8694458007812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.1110420227051, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.9463987350464, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.807354927063, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.5708832740784, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.5168023109436, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.2422413825989, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.4414420127869, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.8063998222351, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 852.9510402679443, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 867.6070356369019, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 867.175350189209, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 876.607837677002, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 880.0158500671387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.2206449508667, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 856.5936088562012, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 882.6395177841187, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.28239822387695, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.6736030578613, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.68272161483765, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.80880212783813, "config": {"BLOCK_SIZE_M": 
64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.1408009529114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.2080044746399, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.9041557312012, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.17791748046875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.8457589149475, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.82864809036255, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.9638395309448, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.3984022140503, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.29583501815796, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.7081651687622, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.9934401512146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0163216590882, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 456.99520111083984, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.1779217720032, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.1622462272644, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.7960014343262, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.2519965171814, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.47680377960205, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.38112020492554, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.260639667511, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.0785593986511, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.2048010826111, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.44480419158936, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.65999603271484, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.5547204017639, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.12479972839355, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.7275195121765, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.0918426513672, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.1025576591492, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.9486374855042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.7270374298096, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.3964776992798, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.0095992088318, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.7239999771118, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.3022375106812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.382559299469, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.8540754318237, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.692325592041, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, 
"num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.0692849159241, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.1390419006348, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.959520816803, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.6307163238525, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 651.6734457015991, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7107257843018, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.9980821609497, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.27535247802734, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.18943786621094, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.12784004211426, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.7944059371948, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.93055868148804, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.1607995033264, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.3782424926758, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.37776231765747, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.69552659988403, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.3015990257263, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.2847990989685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.3799991607666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.1166386604309, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.84160137176514, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.42159605026245, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.3630437850952, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.49312019348145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.51471996307373, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.1001687049866, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.26128482818604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.3609666824341, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.0411195755005, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.81823539733887, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.16399765014654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.3020796775818, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2923192977905, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.5092759132385, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.7312026023865, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.1987190246582, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.5940837860107, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.9623975753784, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.7955207824707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.1281614303589, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.7175970077515, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.3671932220459, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.8595190048218, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.1315155029297, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.7591972351074, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.56207752227783, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 835.6031894683838, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 818.4662485122681, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.6188836097717, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.6275200843811, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.7331228256226, "config": {"BLOCK_SIZE_M": 128, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.6758451461792, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.2902393341064, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 728.7976050376892, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.5948877334595, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 830.9124898910522, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.8158373832703, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 723.8008046150208, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 858.5790395736694, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 799.6700763702393, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.0625596046448, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 742.6270413398743, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 839.1521549224854, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 823.2535982131958, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.7518391609192, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 726.2433576583862, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 832.1303987503052, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 800.4521584510803, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 722.0651245117188, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 720.3507232666016, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 836.7769598960876, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 820.2064108848572, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.5403218269348, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 721.1760020256042, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 851.9001770019531, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.8185601234436, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.136962890625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 730.6113648414612, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1014.4359922409059, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 989.2935991287231, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1023.8619184494017, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 996.4471960067749, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1036.9926309585571, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1004.6833562850952, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1045.6390380859375, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 998.192629814148, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.0297622680664, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.181914806366, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.3636798858643, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.2863955497742, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.2203235626221, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.9193625450135, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.9127955436707, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.5395121574402, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.615843296051, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.1145558357239, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.5742311477661, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.4755244255066, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.7340793609619, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.1411185264587, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.6751928329468, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.7374377250671, 
"config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.435516834259, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.5256028175354, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.6639986038208, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.5617570877075, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.7862424850464, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.3012833595276, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.8846440315247, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.50416231155396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 790.8908820152283, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.452962398529, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 785.0726413726807, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 676.9409680366516, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 791.2734389305115, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.1209597587585, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.2348818778992, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 684.179515838623, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 651.5678429603577, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.069598197937, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.568642616272, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.3299198150635, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.6228814125061, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.4126400947571, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.7259168624878, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.2532777786255, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.3606419563293, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.1836829185486, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.3316802978516, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.53648042678833, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.3808007240295, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.6769647598267, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.5051217079163, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.65295696258545, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2353.8430309295654, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.8521604537964, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2359.606056213379, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.381917476654, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2369.126396179199, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.8824033737183, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2365.816173553467, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.8760004043579, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1705.1086330413818, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1187.0796871185303, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1709.7920036315918, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1207.513279914856, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1706.3822412490845, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1214.5001649856567, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1729.2284870147705, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1204.6752071380615, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1726.354742050171, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1190.2595043182373, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1732.1048164367676, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1186.7190408706665, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1709.7470474243164, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1185.4171133041382, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1713.63920211792, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1217.8872060775757, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1715.4060745239258, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1068.0207920074463, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1724.0322875976562, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1064.9443197250366, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1721.7067241668701, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1049.4910383224487, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1743.8265705108643, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1057.1158361434937, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6408.664588928223, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 815.3313589096069, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6532.301502227783, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.5982403755188, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6358.906688690186, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 852.1980857849121, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6353.005447387695, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 841.2091159820557, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.0852756500244, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.96560621261597, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.0860848426819, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.1376008987427, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.7555198669434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.1956729888916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.4433612823487, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.44815540313726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.150399684906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.0355176925659, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.43343925476074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.1057562828064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.6828765869141, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.20159626007074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.204161643982, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.87119865417486, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 485.4080033302307, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.6735987663269, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.88768100738525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.0135974884033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.7760000228882, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.6187224388122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.01408386230474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.68608140945435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.0433621406555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.7516837120056, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.4718384742737, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.5723304748535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.9081573486328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.06592035293585, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.4692831039429, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.1873579025269, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.12127351760864, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.0223970413208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.32496213912964, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.9419169425965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.1529603004456, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.21744441986084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.89936256408697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.1891169548035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.2335991859436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.1750416755676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, 
"num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.9977602958679, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9316725730896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.0199990272522, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.23136472702026, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.0340785980224, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.8393588066101, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.8497619628906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.7232003211975, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.32144498825073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.3366394042969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.4559960365295, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.1427221298218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.3049631118774, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.8111987113953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.55679845809937, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.9460768699646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.294397354126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.2887954711914, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.6310377120972, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.7464070320129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7935910224915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.8652720451355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.6719999313354, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.0483255386353, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.5270366668701, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 699.3484783172607, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.7481603622437, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.3449606895447, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 712.8734374046326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 730.3188872337341, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.720639705658, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.4857611656189, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.7827181816101, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.4227209091187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.3163223266602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.9443206787109, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.7758402824402, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 724.8345613479614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.0686411857605, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.6054368019104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 699.7318410873413, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.5819187164307, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.231517791748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.835994720459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.2327971458435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.839198589325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.8398427963257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.193118095398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.4913573265076, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.1529598236084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.0884766578674, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.1091213226318, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.064160823822, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.9113621711731, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.58448362350464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.8447952270508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.4459261894226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.73663902282715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.75567674636847, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.56704235076904, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.12464237213135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.7566428184509, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.83055925369257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.57632398605347, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.91792821884155, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.99424171447754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.93760347366333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.45055770874023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.68336057662964, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.6025619506836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.051522731781, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.7417573928833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.43215703964233, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.4963231086731, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.3262391090393, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.8684763908386, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 479.40191745758057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.0599961280823, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.69808626174927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.2529630661011, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.89280462265015, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.1897587776184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.74768114089966, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.26880168914795, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.31295919418335, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.19312381744385, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.6156873703003, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.93568181991577, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.8990378379822, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.1467208862305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.9643177986145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.08560037612915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.385437965393, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.7334337234497, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.52720308303833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.60815620422363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9214396476746, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.9135975837708, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.98560380935663, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.01311779022217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.68336677551275, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.9588847160339, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.5678377151489, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.1737632751465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2985548973083, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.5193614959717, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.65968132019043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.51663637161255, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.5499176979065, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.1532826423645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.87968015670776, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.9691219329834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.2126388549805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.8881559371948, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
505.8816003799439, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.63264179229736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.3636770248413, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.3644785881042, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.1956796646118, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.080952167511, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6624007225037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.5057644844055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.6441602706909, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.5587210655212, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9558339118958, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.396960735321, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.2228832244873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.907205581665, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.5827231407166, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.8382411003113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.0884766578674, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.6764788627625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7270412445068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.233277797699, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.5124821662903, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.7892827987671, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.0086398124695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.16544103622437, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.6416029930115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.19519901275635, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} 
-{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.7753577232361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.5175995826721, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.1102418899536, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.88047790527344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.1379222869873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.59039878845215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.0420804023743, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.64144468307495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.0705590248108, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.71183919906616, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.72800159454346, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.91439628601074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.28063917160034, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 
128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.4681577682495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2731213569641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.6806421279907, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.23423957824707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.79680490493774, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.79872035980225, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.48015785217285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.9545621871948, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.79407596588135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2897605895996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.6164779663086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.4640016555786, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 494.9468755722046, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.4727997779846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.5027174949646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.96047639846796, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.56031942367554, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.48560094833374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.9825611114502, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.60783863067627, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.1030402183533, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.4342432022094, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.5916862487793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2044858932495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.70896196365356, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.49471616745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.4334387779236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.93199586868286, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.98320150375366, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.86560440063477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.60128116607666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.64447879791265, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.744001865387, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.3035182952881, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.3958430290222, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.44208002090454, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.0887985229492, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 866.906886100769, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.9582347869873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.1014375686646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.1659183502197, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 866.7200040817261, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.0760040283203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 869.6676921844482, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.3676776885986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.78768014907837, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.96239852905273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.2587242126465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.489764213562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.5822443962097, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
481.38720750808716, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.8851227760315, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.0804810523987, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.99680185317993, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.25840044021606, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.5697536468506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.9663972854614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.95616388320923, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.58015871047974, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.3919973373413, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.17039251327515, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.234881401062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.96000051498413, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.7324786186218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.10048151016235, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.99952459335327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.0616021156311, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.8316869735718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.04320192337036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.0323238372803, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.9233589172363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.7528042793274, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.81664228439325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.3879933357239, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.72768545150757, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.854241847992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.83984327316284, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.6008014678955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.2598400115966, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.09023904800415, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.21871566772455, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.729278087616, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.27040338516235, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6631994247437, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.3028817176819, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.19456005096436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.5822310447693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.2023949623108, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.4051175117493, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.744161605835, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.3804769515992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.448956489563, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.2846465110779, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.46095848083496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.48223924636835, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.07024240493774, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.55775880813604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.2099237442017, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.2321605682373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2912049293518, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3030366897583, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 490.72896242141724, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.7583951950073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.2696032524109, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.45199728012085, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.465916633606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.80944204330444, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.4297552108765, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.8574438095093, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.38479423522955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.62896394729614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6576013565063, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.303361415863, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.5648002624512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4457621574402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.4225606918335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.1566371917725, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.72463893890387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.38607931137085, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.7014427185059, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.5222392082214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.4579191207886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.5759954452515, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.6089601516724, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.6377558708191, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.1385588645935, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.7558331489563, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.7436771392822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.7686376571655, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.9998397827148, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.8911991119385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.7579202651978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.1947197914124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.6977610588074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.45823907852173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.331202507019, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.0756840705872, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.8144016265869, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.8078455924988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.0916843414307, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.9478406906128, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.9201622009277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.7793612480164, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 826.6424036026001, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.1076788902283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 722.3720026016235, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.0847988128662, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.8398418426514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 803.6212778091431, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 829.3494415283203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.8339185714722, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 726.9435238838196, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 751.683042049408, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.3646330833435, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 800.8910441398621, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.2955222129822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.8868818283081, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 734.4838428497314, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.6835179328918, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.7505531311035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 795.7335948944092, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 831.3246440887451, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.8571186065674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 731.4302444458008, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.1179141998291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.72704219818115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.22863578796387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.07167768478394, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.3001594543457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.31008195877075, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.32496452331543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.17152070999146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.7907247543335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.2497601509094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.6894392967224, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.1008014678955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.0900812149048, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.82079696655273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.8907175064087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.8904004096985, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.81584024429327, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.4513602256775, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.9657621383667, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.19792652130127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.8231997489929, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.09055852890015, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.2382426261902, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.87568283081055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.51968050003046, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.94111680984497, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.9056010246277, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.03504037857056, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.32799768447876, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.3825578689575, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.2289652824402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.18752098083496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.8846392631531, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.4145579338074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.21376180648804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.9907178878784, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.2952003479004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.7233514785767, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.4108853340149, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 524.8172783851624, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.9878387451172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.2219152450562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.44624423980713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.9891228675842, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.1318402290344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.4284768104553, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.9953546524047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.807680606842, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.0790395736694, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.03600025177, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.6793584823608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.5267171859741, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.8039951324463, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.1043186187744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9806389808654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.8462357521057, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.6281590461731, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.8124828338623, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.98912191390986, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.9908757209778, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.218234539032, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.8076710700989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.4571237564087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.9916763305664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.9270415306091, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.6372828483582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.4788789749146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.2102379798889, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.878080368042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.3063960075378, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.6428799629211, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 651.9772815704346, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.8224067687988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.6457567214966, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.9532804489136, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.7401566505432, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.6641621589661, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.66304063797, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.433919429779, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.3926358222961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.829761505127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.0047998428345, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.7449617385864, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.08128213882446, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.51520061492914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.3510456085205, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.0348844528198, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.78543853759766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.73440551757807, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.5628824234009, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 485.7692813873291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2723159790039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1662397384643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.6169595718384, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.05583572387695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.82895612716675, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.3371253013611, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.5660762786865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.5953674316406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.06799554824835, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.3316817283631, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.16192150115967, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.27135276794434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} 
-{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.666241645813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.54416275024414, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.2276768684387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.17775774002075, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.04383945465094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.5272011756897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.2217574119568, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.9835205078125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.8321557044983, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.53727436065674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.9332790374756, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.8271994590759, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.1246418952942, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 
128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.7857508659363, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.40608310699463, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.9288001060486, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.1668796539307, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.5225591659546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.7696051597595, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.3644866943359, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.249762058258, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.0703997612, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.35423946380615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.4539179801941, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.3895974159241, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 519.4971203804016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1055.4275178909302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.9315228462219, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1057.2361612319946, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.6897597312927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1055.867838859558, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.4671940803528, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1061.0974502563477, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.7521634101868, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.52943992614746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.08495855331427, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.1392059326172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1971244812011, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.5630431175232, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.80399799346924, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.97360038757324, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.8222389221192, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.5934357643127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.1843147277832, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.6988787651062, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.0335998535156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.70464277267456, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.7904000282287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.380163192749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.1184000968933, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.10960149765015, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.47471618652344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.03136253356934, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.75695562362665, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8500752449036, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.1115159988403, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.5495972633362, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.72704124450684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.0276794433594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.90224170684814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 698.6412787437439, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.24688196182257, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 699.6337628364563, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.5758428573608, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 698.2160019874573, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.3816027641297, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.4918427467346, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.9278435707092, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.0838398933411, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.234236240387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.9329586029053, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.7140774726868, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.7951965332031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.8648018836975, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.5775990486145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.6791982650757, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 523.0948805809021, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.0353574752808, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.3470392227173, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.4479994773865, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.0177612304688, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.5779228210449, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.1201601028442, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.35280227661127, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.4374408721924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.2092814445496, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.455677986145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.1811218261719, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.4419178962708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.8161606788635, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.9446401596069, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.8720011711121, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.5518345832825, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.1255984306335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.2588815689087, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.5081572532654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.2849626541138, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.2387142181396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.9342470169067, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 682.4647974967957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.7521586418152, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.4576063156128, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 
8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 726.3558435440063, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.9828791618347, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.5409641265869, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.6811218261719, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.7537617683411, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 742.9540824890137, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 730.9065628051758, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 718.0647993087769, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.320164680481, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.039514541626, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.2228770256042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.3521609306335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.824960231781, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 723.3262419700623, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.501118183136, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.5208015441895, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.878876209259, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.4704031944275, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 733.6801552772522, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.2984027862549, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 940.7623863220215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 955.5958366394043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 953.7668800354004, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 951.1556768417358, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 944.8329639434814, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 963.8323259353638, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.7395153045654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 963.7534379959106, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.2254385948181, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.4683165550232, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.3787198066711, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.4079999923706, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.2459173202515, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.318877696991, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.6484789848328, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.2289581298828, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.5846428871155, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.2350363731384, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.6835241317749, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2571206092834, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.5548787117004, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.8451209068299, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.7647972106934, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.2681574821472, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.0959987640381, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.92512702941895, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2835206985474, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.1003170013428, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.8113651275635, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.850239276886, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.3204736709595, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 
8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.534722328186, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2062406539917, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.7150421142578, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.4735984802246, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.7067203521729, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.6193552017212, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.5945534706116, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.5275173187256, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.3984007835388, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.6486411094666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.5971174240112, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 718.0782389640808, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.5731272697449, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.7060770988464, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.7814388275146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.1334404945374, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.5880002975464, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.5689625740051, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.8631939888, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 718.2345581054688, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.2115187644958, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.0481600761414, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.1619215011597, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.7983980178833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.7308821678162, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 493.82367610931396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.340961933136, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.59951972961426, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.330717086792, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.3159999847412, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.0375967025757, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.3964800834656, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.955837726593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.5385570526123, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.7988820075989, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.8028831481933, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.7795214653015, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.76224279403687, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.5102415084839, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.5587229728699, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.95167875289917, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.45408296585083, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.69631719589233, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.9968008995056, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.6489644050598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.63295888900757, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.6289563179016, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.32287883758545, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.4457588195801, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.3753633499146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.6912040710449, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.8660802841187, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.6347208023071, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.1766395568848, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.0535988807678, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.3782396316528, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.756959438324, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.949914932251, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.4900794029236, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.9505658149719, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.292640209198, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.938717842102, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.1531229019165, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.5751953125, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.895998954773, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 926.3916778564453, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 888.7593650817871, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.0353546142578, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 791.7734408378601, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 929.420166015625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 884.3414449691772, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 766.3147211074829, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 807.7310419082642, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.5884742736816, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 902.4166536331177, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 781.7185640335083, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 789.637439250946, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.4544010162354, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 887.3819255828857, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.3927998542786, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 809.4550561904907, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.191349029541, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 903.2862281799316, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.9759964942932, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 790.7872009277344, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 926.6812753677368, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 894.0708780288696, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 784.125759601593, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 787.0670342445374, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 
32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 929.6679878234863, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 903.9553642272949, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.4537601470947, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 789.2934417724609, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 930.4092741012573, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 892.2116899490356, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 789.1793632507324, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.3494372367859, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1137.5139141082764, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1090.9420776367188, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1137.4851179122925, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1101.7759990692139, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1130.6772804260254, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1092.7225637435913, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1142.1615934371948, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1100.2232074737549, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.1915183067322, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.0406422615051, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.7856040000916, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.0784091949463, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.0051217079163, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.5140814781189, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.0745568275452, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.4980778694153, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.2331204414368, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 654.0476822853088, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.6044769287109, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.461594581604, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.7257614135742, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.5364837646484, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.5271973609924, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.1447939872742, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.4283208847046, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.2326436042786, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.593279838562, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.9751996994019, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.8041605949402, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.6540746688843, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.3017597198486, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.1187200546265, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 864.3463945388794, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 742.0249629020691, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 862.1806335449219, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.2153587341309, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 869.3523120880127, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.6206398010254, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 868.528323173523, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.3745565414429, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.306236743927, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.1201601028442, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.6222410202026, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.6719989776611, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.098560333252, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.1249570846558, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 689.7984051704407, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.312801361084, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.5449585914612, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.4169569015503, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.5604820251465, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.1217579841614, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.3001565933228, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.9342365264893, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.6128039360046, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.1451215744019, 
"config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2486.128807067871, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.3510394096375, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2486.2731170654297, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.864963054657, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2493.8580989837646, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.494556427002, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2496.9319820404053, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.5116767883301, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1867.194709777832, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1290.5307245254517, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1900.1716804504395, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1321.0308742523193, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1893.198709487915, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1319.5684814453125, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1890.816307067871, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1333.3575963974, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1875.5905723571777, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1301.571192741394, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1901.9284629821777, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1325.803198814392, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1898.8699054718018, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1305.6180810928345, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1894.7088050842285, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1328.2320070266724, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1886.0566425323486, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.46928024292, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1887.835521697998, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1171.8750476837158, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1886.886568069458, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1172.1923303604126, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1892.0287895202637, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1172.137746810913, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7437.707328796387, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.2992143630981, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7446.624984741211, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 961.2612819671631, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7449.484100341797, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 951.5705585479736, "config": {"BLOCK_SIZE_M": 256, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7462.863845825195, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 963.9700746536255, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.4683265686035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.8812799453736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.2039957046509, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.8849558830262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.535041809082, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.6169619560241, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.2484784126282, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.2203259468079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.99952030181885, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.2609548568726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 502.0217633247376, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.69744157791143, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.3819198608398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.03440093994146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.8193626403809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.5635228157043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.5350422859192, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.783679485321, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.25088262557983, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.3147230148315, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.6255970001221, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.92319965362555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.0958437919617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8707165718079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.5857620239258, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.54591274261475, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.5342445373535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.8804788589477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.7443189620972, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.62703847885126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.136960029602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.5027170181274, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.5225553512573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.8708806037903, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.98223972320557, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.7809634208679, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.8465633392334, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.1214451789856, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.06112241745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.9718308448792, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.25199985504156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.18080043792725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.1675229072571, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7388806343079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.5206365585327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7491164207458, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.0622401237488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.7001576423645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.76000070571905, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.197283744812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.6582384109497, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.6271958351135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.5048017501831, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.8639969825745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.4891223907471, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.8833632469177, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.92848157882685, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.8481554985046, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.08863735198975, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2897582054138, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.8878445625305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 520.4760003089905, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.6807999610901, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.4735999107361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.0507230758667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.1046380996704, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.5580816268921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 727.6252770423889, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.7937569618225, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.9860825538635, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.1772804260254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.8801574707031, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.6022386550903, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.5713601112366, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.6812782287598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 733.1273603439331, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.0902342796326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.7324805259705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.580798625946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.8555135726929, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.4102449417114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.7283158302307, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.3057560920715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 726.580958366394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.8023977279663, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.2582440376282, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 735.271520614624, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 756.7788791656494, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.7030410766602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.3348803520203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.9729561805725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 734.891197681427, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.9734344482422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.4046382904053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.7936015129089, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 764.5177602767944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3163237571716, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.20784091949463, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.74639987945557, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.653603553772, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.2846422195435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.91664266586304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.6219220161438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.8408017158508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.1708860397339, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.87439918518066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.3262367248535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.40255975723267, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9654397964478, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.1000008583069, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.4718360900879, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1510405540466, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 519.8039984703064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.7027220726013, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.2012801170349, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.23839950561523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.1929655075073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.3583993911743, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.53119945526123, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.5385603904724, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.8937606811523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.7992067337036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.3260793685913, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.04560232162476, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.7900810241699, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.5387210845947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.19727754592896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.35712242126465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.2566428184509, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.8076796531677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.5062427520752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.4223985671997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.74303865432734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.8964829444885, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.195041179657, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.9041628837585, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.346875667572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.742883682251, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2823967933655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.3124761581421, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.0180788040161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.20256042480474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.4777574539185, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.8342337608337, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.34767723083496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.69407510757446, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.0776033401489, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.5623970031738, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.295841217041, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.77536582946783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
522.6928019523621, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7587199211121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.7124800682068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.0947203636169, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.5115194320679, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.4115238189697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.649441242218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.77664709091187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.9239993095398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.5728025436401, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.7089648246765, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.0392031669617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2116832733154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.5271978378296, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.3435215950012, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.3740773200989, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.6324815750122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.4326395988464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.4683208465576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.4478397369385, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.2807984352112, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.4932799339294, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.4372835159302, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.9358386993408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.9153552055359, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.4440026283264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.95216274261475, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.37168264389044, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.95215940475464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.6353597640991, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.2968006134033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.33807945251465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.9230399131775, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.21999740600586, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.9783983230591, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.5326375961303, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.7496008872986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.99999380111694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.1078386306763, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.2721600532532, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.1339225769043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.41776323318476, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.6593608856201, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.3484783172607, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.18656301498413, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.54144525527954, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.8801574707031, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0513668060303, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.6087999343872, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.4169602394104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.27823972702026, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 505.0735998153687, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.24592208862305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.59504032135004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.9980845451355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.8708758354187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.70400524139404, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.5147199630737, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.859034538269, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.0708780288696, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.243043422699, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.9550361633301, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.24896287918085, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0473589897155, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.9753608703613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.4312000274658, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.4337573051453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.00447845458984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.1342363357544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.1036796569824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4849605560303, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.468475818634, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.4718384742737, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.40480041503906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.4371123313904, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.7673602104187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.7313566207886, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.2281656265259, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.5443224906921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.30847787857056, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.067202091217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.25247716903687, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 873.9595222473145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.3214464187622, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 875.6593608856201, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.8228802680969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 878.1020879745483, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.2180762290955, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 880.4219150543213, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 579.1100835800171, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.27296209335327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.1137628555298, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.2151994705201, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.0030369758606, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.88592195510864, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.97920179367065, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.8961606025696, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.6764802932739, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.596800327301, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.99375581741333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.06112337112427, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.85072040557867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.7419214248657, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.77120208740234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.08976078033453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.7332777976989, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.28240728378296, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.6641597747803, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.22351980209345, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.2849593162536, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.53344202041626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.03599643707275, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.4019227027893, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.38560771942144, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.8268775939941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.10064029693604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.7129611968994, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.7127995491028, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.9963202476501, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.41280174255365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.0297555923462, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.87760210037237, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.4785590171814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.77696561813354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.9913630485534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.2907176017761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.331042766571, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
516.2971234321594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.4036812782288, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.7142324447632, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.8615975379944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.179039478302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4752068519592, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.0612831115723, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.9600014686584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.4860768318176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.8635258674622, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.8441586494446, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.6832003593444, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.8817615509034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2287936210632, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.4427247047424, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.7012791633606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.8084826469421, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.4262442588806, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.8625593185425, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.3200030326843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.30128383636475, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.8518390655518, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.5222420692444, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.7392001152039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6697607040405, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.3364782333374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.8494448661804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.3870453834534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.3788776397705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2649564743042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.3494372367859, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.9249663352966, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.6672039031982, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.4611210823059, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.7062420845032, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.6132788658142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.6665630340576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.5395140647888, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.4452862739563, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.1038355827332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.5643196105957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.0839996337891, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.912317276001, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2507171630859, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.827995300293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.1527991294861, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.2779188156128, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.7446374893188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.9292783737183, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.7076783180237, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 676.8953657150269, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
524.4212794303894, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8569560050964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.6057634353638, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.5054368972778, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.1630387306213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.7710385322571, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.9713578224182, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.9702415466309, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.8340802192688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 815.1260709762573, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 849.7894334793091, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.2385592460632, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 748.7366414070129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.6025590896606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.8425626754761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 818.4918355941772, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.1739177703857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.3099236488342, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.4212837219238, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 774.86896276474, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.1727957725525, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 821.2684774398804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 839.7425603866577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.7608008384705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.967839717865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.8961625099182, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 
4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.0735983848572, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 834.2803192138672, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 852.4587154388428, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.2123212814331, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 756.9198393821716, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 784.0535974502563, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.164475440979, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.27888107299805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.1497611999512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.5118398666382, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.98335886001587, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.20271825790405, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.33328008651733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.408317565918, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.47759771347046, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.7579164505005, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.1075191497803, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.4516825675964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.7830386161805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.10864782333374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.3663954734802, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.5196771621704, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.7567992210388, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.31120014190674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.8830361366272, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
515.3494429588318, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.59376192092896, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.0660786628723, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.94495820999146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.8851180076599, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.38991832733154, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.5830411911011, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.67184019088745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1459274291992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.78336334228516, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.93295907974243, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.28384017944336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.3361630439758, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.2728028297424, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.6196827888489, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.5419187545776, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.9947237968445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.6750349998474, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.8462347984314, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.8102374076843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.0745620727539, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.0027122497559, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.4966340065002, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.5771150588989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.0299229621887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.1224040985107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.2390370368958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.3196802139282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.1967968940735, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.1902418136597, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.3068809509277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.1159982681274, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.3673648834229, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.8622407913208, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.9343981742859, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.4081563949585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.5344014167786, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.392156124115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.4068827629089, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.2889575958252, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.9097609519958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.9139218330383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.9636859893799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.0587229728699, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.2937593460083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.913122177124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.0183973312378, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.3556823730469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.4774346351624, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.9942388534546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 756.0206389427185, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
660.5699229240417, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.0072040557861, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.5899243354797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.4566445350647, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.6368045806885, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.2313628196716, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.2142376899719, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.5833673477173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.510404586792, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.4208011627197, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.1846413612366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.8795166015625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.7809553146362, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.5174403190613, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2390375137329, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.6062397956848, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9942407608032, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.2299199104309, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.2062368392944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.08799934387207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.2209596633911, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.283196926117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.86656570434576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.8977608680725, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.5569634437561, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.0595188140869, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 
5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.9227194786072, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.745436668396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.797435760498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2140765190125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0139255523681, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.3289575576782, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.31904363632196, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.208963394165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.382077217102, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.26863384246826, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.9175972938538, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.4651203155518, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.64863872528076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 
128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.02175855636597, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.47824239730835, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.1793541908264, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.87583684921265, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.739360332489, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.9312014579773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.6950354576111, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.1539249420166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.8056025505066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.5804824829102, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.349123954773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.3574371337891, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 549.0881609916687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.8896050453186, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.6312022209167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.5390415191651, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.3262414932251, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.6239976882935, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.5355200767517, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1065.3275203704834, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.1688051223755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1065.5456018447876, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.2036776542664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1066.7499113082886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.6606431007385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1070.8376026153564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.6713600158691, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.1280026435852, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.5576062202454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.0049605369568, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3812808990479, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.12672090530396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.42480421066284, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.9694356918335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.8934421539307, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.0449600219727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.9678421020508, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.0359992980957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.1976022720337, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1160001754761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.4071955680847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.7804846763611, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.9694423675537, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.7489619255066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.1625638008118, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.480637550354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2881603240967, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.7779173851013, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.6758422851562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.71952056884766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.9921536445617, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.597761631012, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.7950415611267, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 706.3737607002258, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.8225626945496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.4260787963867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.4784049987793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.6713557243347, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.8142409324646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.1977610588074, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.2214407920837, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.7790384292603, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.812958240509, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.4497566223145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.1145629882812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.3395204544067, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.2824048995972, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.9204816818237, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.5564775466919, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7796792984009, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.9665613174438, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.3696007728577, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.6177606582642, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.3550367355347, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 683.5390400886536, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.5329623222351, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 
64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.6483225822449, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.0748791694641, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.3558411598206, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.6289596557617, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.991204738617, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.2774410247803, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.7950401306152, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.5104055404663, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.3081617355347, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.029914855957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.6945600509644, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.889760017395, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.8529539108276, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.8329563140869, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 686.0369610786438, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.366231918335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.181921005249, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.2339253425598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.4809565544128, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.0009617805481, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 736.0620784759521, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.1799988746643, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 716.3363218307495, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 680.030562877655, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.0123243331909, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 756.300802230835, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.7283225059509, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.6806416511536, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.4948744773865, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 681.5555191040039, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.2014403343201, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 748.2766389846802, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.362238407135, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.7193627357483, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.5756778717041, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.622880935669, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.4641585350037, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.6007976531982, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 740.4745578765869, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 967.0486497879028, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 981.858081817627, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 967.1417617797852, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 984.5600080490112, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 977.3425722122192, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 993.8640022277832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 973.7819147109985, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 986.3534450531006, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.4427223205566, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.7827177047729, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.357922077179, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.9599962234497, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.480486869812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.8219218254089, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.4051179885864, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.8777604103088, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.3100790977478, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.51216173171997, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.5694403648376, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.4828805923462, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.0248031616211, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3713603019714, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.4503998756409, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.5815968513489, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.5576004981995, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.74351882934565, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.2067174911499, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.5990414619446, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.411835193634, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.5153579711914, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.1009631156921, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.5553612709045, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.6894435882568, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.0027203559876, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.039840221405, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.4851202964783, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.0019197463989, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.6339201927185, "config": {"BLOCK_SIZE_M": 
64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.1155195236206, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.5940766334534, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.7227206230164, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.7873635292053, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.6532821655273, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.4718379974365, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.348482131958, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.0403227806091, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.6062397956848, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.9921607971191, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.3544011116028, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.4743933677673, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 730.7888007164001, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.490879535675, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.1633596420288, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.009759426117, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 728.9177632331848, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.3723225593567, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.5254368782044, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.2932877540588, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.3552031517029, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.6464009284973, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.9086427688599, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.0392036437988, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.6619186401368, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.1876864433289, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.60911846160883, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.2755198478699, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.253764629364, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.6262397766114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.22016048431396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.2065658569336, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.056797504425, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.15903711318964, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2193646430969, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.51519918441767, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.41839694976807, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8806443214417, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.24096345901495, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.00079870224, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.11279296875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.0025644302368, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.5606412887573, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.5956768989563, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.9220786094666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.1953620910645, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.1167960166931, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.9855990409851, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.8329563140869, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.1136012077332, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.9009585380554, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.4697608947754, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.7167973518372, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.9011149406433, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.1464066505432, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.9596853256226, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.6259198188782, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.7163186073303, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 938.8788795471191, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 926.3723182678223, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 798.2215976715088, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.4881620407104, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 937.4750423431396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 910.9724760055542, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 798.9476799964905, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 824.2094421386719, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.301760673523, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.9924745559692, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 803.9201641082764, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.0571212768555, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 937.4425601959229, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 917.7323198318481, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 807.1347188949585, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.3014450073242, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.6163158416748, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.038722038269, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 807.0151948928833, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.3988752365112, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.3508749008179, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 909.0921545028687, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 812.5785684585571, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 828.8521575927734, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 935.1947164535522, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 937.8656005859375, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 807.895359992981, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.254716873169, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 934.7740745544434, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 913.4953594207764, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 808.8092851638794, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 820.6372833251953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1162.8575944900513, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1116.4334392547607, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1167.4163103103638, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1125.512957572937, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1170.0452756881714, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1124.5208024978638, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.3548774719238, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1126.8772792816162, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.5449638366699, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.9435224533081, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.5649628639221, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
664.4172763824463, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.9807939529419, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.181441783905, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.1183996200562, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.6353578567505, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.673436164856, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.9851198196411, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.6353588104248, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.5921626091003, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.7888011932373, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.161762714386, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.8716778755188, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.1267204284668, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.9352049827576, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.2806377410889, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.553759098053, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.0364799499512, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.9164791107178, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.9308862686157, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.1262383460999, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.6832041740417, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.9193496704102, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.7996706962585, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 875.9024000167847, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.686719417572, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 879.5030403137207, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.5804800987244, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 875.535831451416, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.1651167869568, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 768.022563457489, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.7870440483093, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 701.6942381858826, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.4657578468323, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 773.1475162506104, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.4145665168762, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.575680732727, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.0785593986511, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 774.6984004974365, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.0764765739441, "config": {"BLOCK_SIZE_M": 
128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.1355142593384, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.4337606430054, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.0345640182495, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.5566358566284, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.136157989502, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.943356513977, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2508.334894180298, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.6436805725098, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2514.455032348633, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.0075182914734, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2523.786735534668, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.0100803375244, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 2527.8945636749268, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.1134457588196, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1932.8961658477783, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1341.5974426269531, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1942.062873840332, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1347.3326444625854, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1935.0201511383057, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1346.166877746582, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1949.9556732177734, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1365.85120677948, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1933.2398414611816, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1348.529920578003, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1949.1075229644775, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1369.346890449524, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1938.2436752319336, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1351.6355180740356, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1949.699535369873, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1366.9753646850586, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1873.7366390228271, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1179.4961547851562, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1870.349416732788, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1183.728632926941, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1871.7929649353027, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1183.046236038208, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1880.8094501495361, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1183.2462406158447, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 
64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7475.730094909668, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 969.1713571548462, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7479.1412353515625, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 959.0225601196289, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7483.934288024902, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 964.6747159957886, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7496.789436340332, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 971.7057609558105, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.6713662147522, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.8851232528687, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.6027212142944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.5193600654602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 710.4046368598938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.1734399795532, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.0147247314453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.209282875061, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.1667170524597, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.7363204956055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.9700798988342, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.7659244537354, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 706.0740852355957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.6647968292236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.0038404464722, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.0582399368286, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.7942471504211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.1094398498535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.713595867157, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.670560836792, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.5830335617065, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.1243262290955, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.619038105011, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.9561624526978, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.3785552978516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.0180811882019, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.2615966796875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.8390383720398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.2619218826294, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.9067144393921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.0630431175232, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.2393565177917, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.4468803405762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.7148795127869, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.1697587966919, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.9476790428162, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.2892804145813, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.7883162498474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.1844792366028, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.7217602729797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.924159526825, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.1017599105835, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.7158427238464, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.360643863678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 684.3721628189087, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.601279258728, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.3966360092163, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.652322769165, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.9367957115173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.0846381187439, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.2703990936279, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.9737615585327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 684.6873593330383, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.2043290138245, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.900803565979, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.6604833602905, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.9254412651062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.320484161377, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.006402015686, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 637.566237449646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.8217625617981, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.6489610671997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.5377588272095, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.4278383255005, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.0580821037292, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 720.7521629333496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1009.0684795379639, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1018.0921697616576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, 
"num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.5444717407227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 795.1929616928101, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1114.061918258667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1154.6329593658447, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.7137603759766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 720.7259202003479, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1016.3032007217407, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1034.8028755187988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.8511934280396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 779.7164750099182, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1117.9559993743896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1153.8620805740356, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.6462388038635, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 707.2187232971191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1001.5480089187623, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1029.5339107513428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 815.8494400978088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 783.7220740318298, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1108.8283157348633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.9406394958496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.6627163887024, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.2159967422485, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1005.9084796905518, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1040.8699131011963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 814.3107175827026, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 794.0518426895142, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1116.924638748169, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1147.9395151138306, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1267156600952, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.82159948349, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.1087980270386, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.0385665893555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.9025611877441, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.1406364440918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.3328008651733, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.7372808456421, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.5174479484558, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.080801486969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.1790399551392, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.952962398529, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.1627202033997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.3729600906372, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.1169624328613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.6204733848572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.0316743850708, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.03040599823, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.0777683258057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.1158423423767, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.1169624328613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.734076499939, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.0569581985474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.7198433876038, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.67568063735956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.9483199119568, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.3017654418945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.7483205795288, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.5814366340637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8699178695679, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.1345596313477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.4087963104248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.3142371177673, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.8596787452698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.5419187545776, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 673.4558439254761, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.9710369110107, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.161759853363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.0516781806946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.2230386734009, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.5481648445129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.3644776344299, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.406081199646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.2358388900757, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.8279943466187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.7825598716736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.4593605995178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.5555186271667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.3673620223999, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.1907229423523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.0174431800842, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 668.470561504364, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.4100794792175, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.6692814826965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.7447986602783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 663.4708762168884, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.8035154342651, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.7553558349609, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.5339183807373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.3369617462158, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.0831990242004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.6193590164185, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.316478729248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.8935956954956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 668.8748788833618, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 852.6583957672119, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.2356820106506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.2271971702576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.1395163536072, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 852.2142362594604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.526713848114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.6163196563721, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.042558670044, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
850.1929664611816, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.3739161491394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.8544034957886, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.5081577301025, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 850.6859254837036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.7687983512878, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.903196811676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.1617588996887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.0119996070862, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.4231996536255, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.7622404098511, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.6711988449097, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.9763188362122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.2238402366638, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.5916819572449, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.0360078811646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.9438409805298, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.898886680603, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.6342406272888, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.7265605926514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.2513599395752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.3668761253357, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.036322593689, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.0531187057495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.967200756073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.4104042053223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.6084780693054, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.4782409667969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.3795185089111, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.6808032989502, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.0079989433289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.4396843910217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.9755210876465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.0873599052429, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.6600017547607, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.4667267799377, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.7835211753845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.511522769928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
553.9318370819092, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.8947186470032, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.505922794342, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.5931210517883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.5510406494141, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.2048034667969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.8928046226501, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.3248019218445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.7961616516113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.4975981712341, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.5804858207703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.3884830474854, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.4143953323364, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.9025626182556, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.9694428443909, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.1732788085938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.5456023216248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.04816198349, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.990716457367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.1780805587769, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.9928021430969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.8092789649963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.7881650924683, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.4100842475891, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.0059237480164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1144.3814420700073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 
256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.7180852890015, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1148.235206604004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.495837688446, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1151.7169618606567, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 707.9108786582947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1154.2894315719604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.6063995361328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.2542419433594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.8588809967041, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.8124809265137, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.4609594345093, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.8584027290344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
540.3590393066406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.7190327644348, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.6623978614807, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.488000869751, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.9628767967224, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.197274684906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.1811203956604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.4787158966064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.7526388168335, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.5280017852783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.4454426765442, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.6332874298096, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.001437664032, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.0849566459656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.4868807792664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.8779172897339, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.4171242713928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.8990454673767, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.05215883255, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.9527988433838, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.4622392654419, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.4916753768921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.3515167236328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.8542385101318, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.0782413482666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.0983963012695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.3134436607361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.5363283157348, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.1740880012512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.1343951225281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2897567749023, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.4091172218323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.1241555213928, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.6756825447083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.6694359779358, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.52176332473755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.4608058929443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.0254459381104, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.8531193733215, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.7758402824402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.1686396598816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.019202709198, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.7179160118103, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.6121602058411, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.0048007965088, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.6948852539062, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.2860770225525, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.8935966491699, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.7124810218811, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.154399394989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.7255954742432, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.811683177948, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.8049640655518, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.7662420272827, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.0038361549377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.4740796089172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.032799243927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.7988796234131, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.2593545913696, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.94864320755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.9268846511841, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.0091166496277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.4808011054993, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.3547253608704, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.0809607505798, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.5339198112488, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.300802230835, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.301281452179, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.756317615509, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.2585587501526, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 668.8273596763611, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.1979155540466, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.6921591758728, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.1489591598511, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 694.8887991905212, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.9512014389038, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.1457605361938, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 
32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.2275233268738, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.5281620025635, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.7999978065491, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.8827204704285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.3004789352417, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 697.3012828826904, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.7313613891602, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.3812766075134, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.387686252594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 681.3596749305725, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.005443572998, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.109920501709, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 567.8323173522949, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.567684173584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.312162399292, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 854.4390487670898, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 871.0507202148438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.6833591461182, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 786.16783618927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 797.6203179359436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 650.427520275116, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 852.6073598861694, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 866.4502382278442, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.580641746521, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.9768013954163, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 791.4603209495544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.9311981201172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 853.8056135177612, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 868.8295888900757, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.5865564346313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.623366355896, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 794.4684815406799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.7107300758362, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 863.1739234924316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 875.4688024520874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.9454379081726, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 792.3839998245239, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.8836889266968, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.4883232116699, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.893753528595, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.0329637527465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.5475211143494, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.2343997955322, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.5740795135498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1735982894897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.0507230758667, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.6592001914978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.63615894317627, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.43184661865234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.9995174407959, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.7878384590149, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.33216381073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.2516784667969, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.6526412963867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.9236760139465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.9419240951538, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.1302423477173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.0460858345032, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.3809537887573, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.6100778579712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.390073299408, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.6424036026001, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.71856021881104, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.370080947876, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.663996219635, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.4319982528687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.4270358085632, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.1572771072388, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.6132822036743, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.001763343811, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.0502376556396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.2239990234375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.7252779006958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.8761615753174, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.212480545044, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.2411203384399, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} 
-{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.6987233161926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.2334442138672, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.9057574272156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.7734408378601, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.7116866111755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.1684803962708, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.4588813781738, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.2193641662598, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.5753588676453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.8507256507874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.5831990242004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.91583776474, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.0723218917847, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.6406421661377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.1825585365295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.177761554718, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.8227186203003, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.2686395645142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.3761596679688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.694239616394, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.731207370758, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.8703980445862, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.6918387413025, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.6753611564636, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.781277179718, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 579.4887948036194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.0614433288574, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.7217597961426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.9131212234497, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.784637928009, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.2627205848694, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.5252766609192, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.6939206123352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.7620787620544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.3003196716309, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 774.2734432220459, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 682.3854374885559, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.4878416061401, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.5934376716614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 767.8214430809021, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.0168042182922, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.9539203643799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.4361577033997, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.73648166656494, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.8363256454468, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.4070334434509, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1123223304749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.7001643180847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.1643223762512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.7086415290833, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.0047979354858, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.4697570800781, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6187181472778, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.414234161377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.5281586647034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.5145573616028, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.5441589355469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.1076803207397, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.5953569412231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.3011207580566, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.781596660614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.1494436264038, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.1592030525208, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
512.0659184455872, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.4612803459167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.9817600250244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.356960773468, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.6849637031555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.3753528594971, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.2321605682373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.7380752563477, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.740475654602, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8348798751831, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.3731231689453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7470369338989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.2252793312073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.5422420501709, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.1472010612488, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.4108815193176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.8107171058655, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.8635206222534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.1417646408081, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.7251200675964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.0907163619995, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.9886436462402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.8934426307678, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.5759978294373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.6726403236389, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.4998464584351, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 
128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.4041676521301, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1076.6366386413574, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.8084797859192, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1079.6487951278687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.598714351654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1082.57408618927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.8479986190796, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1088.6448001861572, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.7835149765015, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.5625591278076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.8921551704407, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.575840473175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 532.6927995681763, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.4260811805725, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.15808248519903, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.065279006958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.5601649284363, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.2265601158142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.6760034561157, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.738558769226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.4280014038086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.5155172348022, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.4579191207886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.2415995597839, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.5931262969971, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.8719964027405, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.5687975883484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.923360824585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.0590362548828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.4257607460022, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.8180747032166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.4412860870361, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.9108805656433, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.6484794616699, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.3580794334412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.2320036888123, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.8339157104492, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.9756836891174, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.5992031097412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.0547204017639, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.9952020645142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.4049587249756, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.0150356292725, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.4175987243652, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.8760032653809, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.882246017456, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.6726403236389, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.9423985481262, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.2865600585938, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.254554271698, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.7393655776978, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.3375973701477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.9926424026489, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.2015962600708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.0145578384399, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.8340797424316, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.0331211090088, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.14319896698, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.2731223106384, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.5272002220154, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.6798377037048, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.8075256347656, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.1019263267517, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.7054409980774, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.764479637146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.3817558288574, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.2521662712097, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.5632004737854, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 663.4691143035889, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.9612817764282, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.2305612564087, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.7356758117676, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.3982396125793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.8871955871582, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 749.4414401054382, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.564001083374, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} 
-{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.2438387870789, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 774.2369604110718, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.3542394638062, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.6750478744507, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 742.8524780273438, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.4099254608154, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.1587224006653, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 771.065924167633, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.6823945045471, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 650.2564787864685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.3151965141296, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.4660768508911, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 759.3793654441833, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 779.9107146263123, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 769.486403465271, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.2800006866455, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.218403339386, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.6131181716919, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 758.4811210632324, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 779.8339152336121, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.0292820930481, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 996.4808034896851, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1018.601279258728, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 992.7102375030518, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1018.5784053802491, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 1005.8182382583618, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1023.4287977218628, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1006.7497634887694, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1023.1715202331544, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.5576019287109, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.6020832061768, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.6030349731445, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.2105631828308, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.1219258308411, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.1479992866516, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.8887987136841, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.1942348480225, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.7524819374084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.6739249229431, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.3076763153076, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.4382433891296, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.2767949104309, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9227228164673, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.2659244537354, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.2568025588989, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.70383644104, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.4332799911499, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.380156993866, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.9524827003479, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.2622423171997, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7368021011353, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.5859236717224, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.0787234306335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.0035157203674, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.995840549469, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.1692771911621, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.7068791389465, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.1044821739197, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.5995163917542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.7801637649536, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.5883226394653, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 681.4103960990906, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.3209552764893, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.2929592132568, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.0465593338013, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.4545550346375, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.3747138977051, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.3230395317078, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.5823993682861, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 680.5529594421387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.3535962104797, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.3708744049072, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.6663956642151, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 680.5982398986816, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.0641598701477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.6276807785034, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.7433590888977, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6271982192993, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.0913624763489, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.9884777069092, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8231983184814, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.1590437889099, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.7718362808228, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.0752058029175, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.4724817276001, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.4711985588074, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.7331275939941, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.2932796478271, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.2630400657654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, 
"num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.1612801551819, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.3769550323486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.9544053077698, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.6295971870422, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.9455981254578, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2756862640381, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.4188771247864, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.3984022140503, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.3758397102356, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.0684819221497, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.5692868232727, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.4145693778992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.6209635734558, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.2206382751465, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.5919995307922, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.1087942123413, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.7662410736084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.675359249115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.3584036827087, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.3766374588013, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.719202041626, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.6025557518005, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.5700764656067, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.8479995727539, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.3670401573181, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.6673564910889, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.5814433097839, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.0843200683594, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 942.0443248748779, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 948.3080005645752, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.94225025177, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 830.9920024871826, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.7846536636353, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.841588973999, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.9993591308594, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 828.1028842926025, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.4806480407715, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 955.6609582901001, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 
3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 824.8297595977783, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 834.2294263839722, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 938.6097621917725, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.8777656555176, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.3409585952759, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.2539157867432, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 943.7436819076538, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 953.3852815628052, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.9041595458984, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 834.291672706604, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 938.3051252365112, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 935.8153629302979, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 820.9539127349854, "config": {"BLOCK_SIZE_M": 128, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 839.5467138290405, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 945.5777549743652, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 958.6444711685181, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 828.9371252059937, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 836.2817716598511, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 940.1252841949463, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.4692897796631, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 820.5091238021851, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 827.9468774795532, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1188.7638425827026, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1139.0116786956787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1192.6312065124512, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 1144.9139213562012, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1196.6630458831787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1148.8268852233887, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1199.1324853897095, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1151.2102365493774, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.3561611175537, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.2972812652588, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.8870348930359, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.4310455322266, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.9700808525085, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.2256026268005, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.1779141426086, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.4120001792908, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.2089548110962, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.3401656150818, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.3679986000061, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.8452777862549, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.056960105896, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 663.0862379074097, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.4846396446228, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.0979180335999, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.2030415534973, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.6873650550842, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.217921257019, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.5999975204468, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.6790399551392, "config": {"BLOCK_SIZE_M": 128, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.2347226142883, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.4379177093506, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.4911932945251, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 893.0113649368286, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.9203195571899, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 897.3352003097534, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.5142402648926, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 903.1423902511597, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.7889585494995, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 904.4144010543823, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 769.8015999794006, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 779.8886394500732, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 625.925440788269, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.5025572776794, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.8607997894287, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 790.2179217338562, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.1444764137268, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.8600015640259, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.758243560791, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 794.1782355308533, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.2632012367249, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.0107216835022, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.5444741249084, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 796.6763210296631, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.3857679367065, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 716.9395160675049, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.1494383811951, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2539.4411087036133, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.7367973327637, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2552.7148723602295, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.6376004219055, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2568.1479930877686, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.1838355064392, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2573.318395614624, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.6927938461304, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1952.103033065796, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1359.0283298492432, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1958.6380863189697, "config": {"BLOCK_SIZE_M": 
256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1372.5860738754272, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1959.9724960327148, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1364.3467140197754, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1967.141752243042, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1370.417766571045, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1964.930076599121, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1362.9128122329712, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1967.912302017212, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1379.5636749267578, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1965.8495903015137, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1365.1654386520386, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1970.8820724487305, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 1378.2129621505737, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1868.5980892181396, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1193.9454507827759, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1866.2722873687744, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1197.3190450668335, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1872.7472019195557, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1199.8555278778076, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1877.0246410369873, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1198.0375957489014, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7500.65071105957, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 968.5006332397461, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7496.341361999512, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 979.2843151092529, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, 
"num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7503.418045043945, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 981.8358421325684, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7526.106185913086, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 985.64528465271, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1259.6371126174927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1229.0788793563843, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1267.1767902374268, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1338.947515487671, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1885.2336120605469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1829.1235160827637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1836.0921669006348, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1887.8436851501465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1254.809913635254, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1235.1792001724243, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1267.8843212127686, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1330.5446434020996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1872.5360107421875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1859.3099308013916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1874.436321258545, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1921.0147285461426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1252.7068710327148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1234.2902421951294, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1268.9958429336548, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1333.0531215667725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1888.139820098877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 1859.1505718231201, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1876.254072189331, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1926.902084350586, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1247.8775882720947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1232.8001594543457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1263.5199975967407, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1334.09423828125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1880.45503616333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1844.8167896270752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1862.0091438293457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1917.6316738128662, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1258.6678314208984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1342.611198425293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1482.4416017532349, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1720.6641674041748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1856.96928024292, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1815.0046348571777, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1933.269920349121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2183.1913566589355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1266.679196357727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1325.452470779419, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1475.4838466644287, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1707.745590209961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1844.832468032837, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1815.1012802124023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1931.3668727874756, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2174.868803024292, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1266.0057735443115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1327.5030374526978, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1480.5196857452393, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1712.7619075775146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1853.2225608825684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1816.4598369598389, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1924.2225646972656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2175.711679458618, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1263.549599647522, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1326.150245666504, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1473.4009742736816, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 1709.3239879608154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1847.5390338897705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1811.7511940002441, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1914.3512153625488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2152.9620838165283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1762.9961681365967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1973.5915184020996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2806.9182205200195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2885.530414581299, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2226.8295860290527, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2266.8059253692627, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3211.6020488739014, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3295.11137008667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} 
-{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1795.231056213379, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1960.2460670471191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2831.8447971343994, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2919.1649436950684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2226.222267150879, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2253.9065551757812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3214.8178005218506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3310.1345825195312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1809.108648300171, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1947.1249389648438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2841.1847972869873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2937.2872066497803, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2229.363498687744, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2236.3116931915283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3219.3096256256104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3315.979804992676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1795.524492263794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1951.464958190918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2837.598237991333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2925.0332736968994, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2229.7761631011963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2237.8563117980957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3218.9145374298096, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3316.699962615967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1049.2107200622559, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 1071.2718439102173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1108.029751777649, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1229.4878387451172, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1170.725440979004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1114.7571182250977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1149.4355154037476, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1250.3031921386719, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1021.492476463318, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1028.8374376296997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1086.8889474868774, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1192.2465658187866, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1185.2846431732178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1101.3500928878784, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 
512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1133.2726430892944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1232.7007913589478, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1021.5696096420288, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1029.5265626907349, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1080.1041507720947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1189.688959121704, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1181.3865518569946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1097.504644393921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1119.9983930587769, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1234.8043298721313, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1015.2828788757324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1029.4039916992188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1082.4987173080444, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1186.7692804336548, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1171.7518424987793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1090.1497650146484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1125.0076913833618, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1230.3971242904663, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1098.7814378738403, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1290.2457666397095, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1674.1484928131104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1704.6199893951416, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1201.496000289917, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1266.765432357788, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1693.1745529174805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1749.5990371704102, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1083.1881666183472, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1251.7452764511108, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1550.8132791519165, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1621.9798517227173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.3945455551147, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1232.2880029678345, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1679.4268608093262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1753.7819194793701, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1083.342866897583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1248.0281591415405, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1551.6948699951172, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1635.6678581237793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1207.8156757354736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1223.6540842056274, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1672.1687984466553, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1775.3828811645508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1082.9936027526855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1243.1292724609375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1542.17520236969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1629.67679977417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1205.3932809829712, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1218.94287109375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1667.2649574279785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1758.2412910461426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1728.3876705169678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2251.2641620635986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1474.0286493301392, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1972.3675155639648, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1675.532627105713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2198.624143600464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1452.8283262252808, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1999.5867156982422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1690.782871246338, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2193.932647705078, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1441.0691213607788, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.0332870483398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1685.5443286895752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 2194.9545574188232, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1442.394404411316, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1977.8499126434326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 970.0470399856567, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 997.0609664916992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1072.8740692138672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1248.5355234146118, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1031.5075254440308, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 984.0806341171265, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1018.7641525268554, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1198.475193977356, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 950.4982328414917, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 979.9567937850952, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1019.9463891983031, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1141.3403129577637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1027.8775930404663, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 981.6134405136108, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1007.3108768463135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1205.0265645980835, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 951.2828826904297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 980.0927972793579, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.1099262237549, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1140.5527973175049, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1023.2049512863159, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 979.2481517791748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1009.7663974761963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1197.27135181427, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 941.0833597183228, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 970.2332782745361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1016.3004875183105, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1138.6312007904053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1021.0259103775023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 974.6134424209595, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1001.2521648406982, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1199.042239189148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1044.4852781295776, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1448.9542436599731, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1472.3787260055542, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 1119.3969535827637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1284.0489721298218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1319.4139194488525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1030.494885444641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1308.4395265579224, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1311.0385608673096, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1094.4731187820435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1207.2350454330444, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1228.1020879745483, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1037.1731233596802, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1300.8471965789795, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1303.5825490951538, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1095.0407981872559, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1202.0291137695312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1237.1227169036865, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1028.5054445266724, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1298.2577562332153, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1289.831042289734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1085.804796218872, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1204.1727924346924, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1224.199833869934, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2830.60001373291, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1908.4585571289062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2821.8806552886963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1704.8300552368164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2823.439989089966, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1700.1129627227783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2822.421417236328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1693.81760597229, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1055.8260822296143, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1122.2932720184326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.8988733291626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 994.9108743667603, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.9841585159302, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1003.8108825683594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1029.577922821045, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1030.7419157028198, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1022.4097585678101, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 959.6831941604614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 966.933913230896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.3617658615112, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.6419191360474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1050.8609580993652, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1015.1004791259766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.5182361602783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 977.7567958831787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 955.3683137893677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1025.8622407913208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1032.6502466201782, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1003.0964756011962, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 964.716968536377, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 967.7892780303955, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 955.720009803772, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1538.6998558044434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1461.3667249679565, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1401.9203281402588, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1329.8638439178467, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1414.4491243362427, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1319.8528003692627, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1392.3977613449097, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1304.7816038131714, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 944.4953536987305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 921.1899089813232, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 959.8500728607178, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1027.9425525665283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1248.3094453811646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1144.661283493042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1161.59423828125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1230.2307224273682, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.8316783905029, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 934.6414566040039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 982.3657655715942, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1043.7963247299194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1224.8020887374878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1137.7555227279663, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1173.082389831543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 1222.1680116653442, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 953.5150289535522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 942.4519872665405, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 981.3996696472168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1050.058879852295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1221.3108777999878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1138.8548803329468, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1172.755208015442, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1220.6654262542725, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.6423978805542, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.8123188018799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 983.2894420623779, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1055.212163925171, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 
512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1218.5444736480713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1145.8080053329468, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1168.4464025497437, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1225.6032037734985, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1073.4163188934326, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1026.2518405914307, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.492483139038, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1659.54110622406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1236.7595148086548, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.9014434814453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1295.489592552185, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1745.9214115142822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1094.9641704559326, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1032.546877861023, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1164.395203590393, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1676.9783973693848, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1230.4492855072021, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1154.3604850769043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1276.6979217529297, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1756.736183166504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1098.8934421539307, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1039.5990371704102, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1177.2708749771118, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1674.8657703399658, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1216.3171243667603, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 1154.558401107788, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1281.4539241790771, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1770.5393695831299, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1098.1537628173828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1038.7574291229248, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.7703981399536, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1675.3443336486816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1240.3358364105225, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1163.2436847686768, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1287.349443435669, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1772.5182437896729, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1483.6267232894897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2058.093433380127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2138.203344345093, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1488.0023956298828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1934.7883033752441, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2011.0975837707522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1501.5256023406982, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2074.6416091918945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2159.3063926696777, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1523.474555015564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1944.3254375457764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2013.9351940155027, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1496.3699197769165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2065.676803588867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2159.945125579834, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1523.845772743225, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1939.9772930145264, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2017.0648097991943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1509.3516778945923, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2076.5209579467773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2157.4324703216553, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1532.1574449539185, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1928.3809661865234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2026.8329524993899, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.9905681610107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 782.8438425064087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 790.8652782440186, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 854.9108839035034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 912.6281642913818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.8228783607483, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 870.9828805923462, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 918.4670352935791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 843.9142417907715, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 785.466878414154, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 791.5721607208252, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 854.8796939849854, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 908.6244821548462, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 840.716962814331, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 859.8980808258057, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.2230348587036, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 843.012638092041, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.0084824562073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 800.4503989219666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.0716829299927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 910.9155225753784, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 830.6192016601562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 853.8899230957031, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.5508832931519, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 838.1491088867188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.7686381340027, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 788.2316780090332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 857.2512054443359, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 916.290397644043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.8497676849365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 861.4774417877197, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.3064050674438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1032.7127981185913, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 904.6132898330688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1161.0614252090454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1191.8912029266357, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1032.157917022705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 903.1121587753296, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1200.6313610076904, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1231.362886428833, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1025.4160022735596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
912.3614358901978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1153.566074371338, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1203.8488101959229, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1051.1552000045776, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 904.2190456390381, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.6700716018677, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1228.632001876831, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.8041677474976, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 906.7972755432129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1155.9976053237915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1208.2734441757202, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1050.1409721374512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 908.6545562744141, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.9870443344116, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1229.4998359680176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.619517326355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 907.6220798492432, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.5985660552979, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1203.9919996261597, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1051.995997428894, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 899.6364688873291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1172.9750490188599, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1224.14559841156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1271.17600440979, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1871.4967823028564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1560.0763273239136, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1430.594882965088, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1283.8611221313477, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1842.8488159179688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1568.6601543426514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1429.5713520050049, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1279.6668720245361, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1839.9696063995361, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1567.2684717178345, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1429.3268775939941, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1280.3593587875366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1837.193603515625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1569.7999954223633, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1424.4983959197998, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 766.4009618759155, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 737.6667165756226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 884.2187213897705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 877.4310350418091, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.7758502960205, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 748.5017585754395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 858.6201667785645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 867.6632070541382, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.961916923523, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.5412788391113, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 884.1987133026123, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 871.1969566345215, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 884.7262334823608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.1065592765808, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 873.3092784881592, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 867.4163246154785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.946400642395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.021915435791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 888.6913728713989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 874.2224073410034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 885.1534366607666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.3604822158813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 877.9908752441406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 866.2334442138672, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.2582406997681, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.397442817688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.8137636184692, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.7820701599121, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 881.8862390518188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 749.9992036819458, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 868.6065578460693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 864.4028854370117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 920.4859161376953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1009.4046401977539, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 974.5896053314209, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 964.6912050247192, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 895.863676071167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
994.3841505050659, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 989.8822498321533, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.9955263137817, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 900.0070285797119, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 996.6476678848267, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 991.2620830535889, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 953.3940839767456, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 897.6777648925781, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1000.6913566589355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 990.504322052002, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 954.9004793167114, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2263.038558959961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1323.576636314392, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2301.4603233337402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1264.331521987915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2302.694854736328, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1267.673602104187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2307.870569229126, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1258.8591957092285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 787.6683187484741, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 817.1161603927612, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 821.6371250152588, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 981.700005531311, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 736.1006379127502, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.3566384315491, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.0086398124695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 817.7670288085938, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.150707244873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1032.6063966751099, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 761.4759993553162, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.3863978385925, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.251359462738, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 818.1078481674194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.7395105361938, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1034.12832736969, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 761.8644785881042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.936324596405, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 788.0902433395386, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
815.7070446014404, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 820.5142450332642, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1038.333592414856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 759.4526362419128, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 761.155993938446, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1339.7724771499634, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 959.555835723877, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1346.8232107162476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 946.7956829071045, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1346.158561706543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 943.0287933349609, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1353.8382387161255, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 941.4694404602051, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 845.9726428985596, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 775.5967998504639, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 841.9164848327637, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1079.2651176452637, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.840163230896, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 870.3110456466675, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.1449527740479, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1197.0763111114502, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.4366397857666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.570240020752, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 856.6934299468994, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1082.4855852127075, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.4817638397217, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 880.2841663360596, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 948.8760042190552, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1192.2329568862915, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 870.0771236419678, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 813.8033580780029, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 858.876805305481, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1093.7046384811401, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 942.4694442749023, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 882.0011186599731, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 945.2363300323486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1203.0700874328613, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 869.0003156661987, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.8844747543335, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 861.8700838088989, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1091.2603187561035, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.47057056427, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 878.6203193664551, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 944.4849634170532, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1203.243522644043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1033.3436679840088, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1207.5406312942505, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1173.7708759307861, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1192.347526550293, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1320.9932851791382, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1330.1750421524048, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 1050.3980731964111, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1230.4545593261719, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1167.5486421585083, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1196.5935945510864, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1326.1436891555786, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1356.0465621948242, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1045.1977586746216, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1235.4233598709106, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1178.8600063323975, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1208.9457607269287, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1353.184962272644, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1351.0628843307495, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1053.8259315490723, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1235.1115226745605, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1184.0790367126465, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1202.9614400863647, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1341.5307140350342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1351.109766960144, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1668.6043167114258, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1701.7046356201172, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1702.029619216919, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1759.364309310913, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1714.9470329284668, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1757.2215843200684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1700.3249549865723, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1744.2187118530273, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.0782399177551, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.1671957969666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 817.7572774887085, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 794.7724795341492, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.5425634384155, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.604642868042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 896.964168548584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 887.2455978393555, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.5985608100891, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.5459160804749, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.7412910461426, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 792.7508807182312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 757.1131205558777, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.0655989646912, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 887.9617643356323, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 883.8465738296509, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.147358417511, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 682.3033547401428, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 802.4652886390686, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 786.8052816390991, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.2721586227417, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 701.9630408287048, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 890.749921798706, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 874.4563150405884, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.8739199638367, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 682.4007964134216, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.9575939178467, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 788.403844833374, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.8159928321838, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.9315156936646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 879.440803527832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 882.3428821563721, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1108.8305568695068, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.3284816741943, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1241.6048002243042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 961.4681720733643, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1107.2003173828125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 881.5844631195068, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1247.4438428878784, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 955.5788803100586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1107.2967958450317, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 875.1123237609863, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1258.9489603042603, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.6195116043091, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1114.2772769927979, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 875.8384037017822, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1256.9084692001343, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.7646503448486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.7620806694031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.1451177597046, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
671.1793661117554, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.5108728408813, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 689.3284797668457, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.8601622581482, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 650.4443168640137, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.124801158905, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.8396754264832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 650.0739216804504, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.7012825012207, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.0360040664673, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.6044821739197, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.7734422683716, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.2761583328247, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.8214454650879, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.6916809082031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.6492805480957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.2700819969177, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.9756808280945, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.0694398880005, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.4041619300842, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.0518345832825, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 668.7038397789001, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 864.3414306640625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 843.0956792831421, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 876.8566417694092, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 850.1678419113159, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.2503900527954, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 848.4531307220459, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 885.897912979126, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 861.7019271850586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 847.0667171478271, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 706.1308836936951, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.3103971481323, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.1363224983215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 838.4364652633667, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 716.8809580802917, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.9019193649292, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 732.8057599067688, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
989.3824005126953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1066.811842918396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 906.6174411773682, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 899.2847967147827, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 974.1414451599121, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1033.1201601028442, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 884.8302412033081, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 883.7942409515381, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 989.6883153915405, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1063.9334392547607, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 912.336163520813, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 893.0531215667725, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 980.0006341934204, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1033.5857677459717, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 890.5939102172852, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 878.7131214141846, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 996.466236114502, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1067.798252105713, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 921.65696144104, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 902.8363132476807, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 982.524471282959, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1036.8795156478882, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 897.1425676345825, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 882.0582437515259, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1003.8440036773682, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1072.005763053894, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.3512048721313, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 904.8819208145142, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 988.6751985549927, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1035.290560722351, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 903.3412790298462, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 886.4716863632202, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1329.116153717041, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1283.4150457382202, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1339.9849557876587, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1289.2715072631836, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1342.231035232544, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1291.8219137191772, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1349.4299173355103, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1297.330241203308, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.3017587661743, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.5856022834778, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 682.9895949363708, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.56720495224, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.6203184127808, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.1225638389587, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.2948794364929, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.2297639846802, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.5073585510254, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 723.5872006416321, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.2993545532227, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.3057541847229, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.454562664032, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 749.0086364746094, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.6479978561401, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.7281637191772, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 723.5769605636597, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.1873574256897, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.3672018051147, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.7321577072144, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.7396841049194, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 735.5985593795776, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.3025612831116, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.0387234687805, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 969.3411254882812, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 859.346079826355, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 961.6574430465698, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 853.6302471160889, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 967.3595142364502, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 857.2313642501831, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 988.6204767227173, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 868.9715194702148, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 850.5121660232544, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 697.481279373169, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 774.8774480819702, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.3443264961243, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
858.4241580963135, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.9556841850281, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 781.6708827018738, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.3571176528931, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.6891202926636, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.0371189117432, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 784.1585612297058, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.6140828132629, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 874.3204832077026, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 721.0715198516846, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 796.3966393470764, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.4879965782166, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2717.9745769500732, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.3230409622192, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2699.128303527832, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.7743978500366, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2709.370880126953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.9462385177612, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2731.365451812744, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.8323197364807, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2076.4281463623047, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1457.1595239639282, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2097.941131591797, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1478.0262327194214, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2083.8921642303467, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1455.8040046691895, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2107.654552459717, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1482.8478288650513, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2091.898708343506, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1463.9102411270142, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2113.7078285217285, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1491.2033605575562, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2095.1006412506104, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1478.0684804916382, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2118.473119735718, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1499.1064023971558, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1905.901107788086, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1274.986081123352, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1906.136178970337, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1281.6697692871094, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1920.943193435669, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1285.9129619598389, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1945.6445026397705, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1290.278401374817, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7693.116188049316, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1036.5203142166138, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7675.943145751953, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1045.0107145309448, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7711.6047286987305, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1043.6899137496948, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7772.8369140625, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1056.410574913025, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3398.1664276123047, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3375.6281661987305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3467.9852867126465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3659.65389251709, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4979.245738983154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4937.603549957275, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4985.6013107299805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5119.582862854004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3359.736328125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3379.3026161193848, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3461.6123008728027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3615.9483337402344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5002.736167907715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5040.7399559021, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5092.132034301758, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5243.2403564453125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3316.3067054748535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3369.159393310547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3449.480667114258, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3598.1529426574707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5027.1360206604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5043.435821533203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5120.320644378662, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5268.351173400879, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 3298.319206237793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3358.3537673950195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3430.9513664245605, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3578.1862449645996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5025.854225158691, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5059.779815673828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5117.811489105225, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5271.095542907715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3483.228645324707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3669.793472290039, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4052.63614654541, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4730.128307342529, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4969.440116882324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4904.541282653809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5213.0596923828125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5982.516326904297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3422.6590156555176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3600.3073501586914, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3924.7011375427246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4625.71403503418, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4952.659015655518, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4925.298900604248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5185.362358093262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5933.870868682861, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3395.7278442382812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3580.5932998657227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 
32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3956.465129852295, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4595.521926879883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4958.760833740234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4938.946552276611, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5199.3218994140625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5954.500961303711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3378.9064407348633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3558.86287689209, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3929.863815307617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4564.4440269470215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4956.67293548584, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4910.14892578125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 5218.565444946289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5984.881420135498, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4829.424667358398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5433.060321807861, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7806.777114868164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8017.152519226075, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6049.528961181641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6209.582901000977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8967.93815612793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9176.490364074707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4776.638412475586, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5285.107326507568, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7859.498329162598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} 
-{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8068.514213562011, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6004.361743927002, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6197.78959274292, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8971.304893493652, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9194.435691833496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4811.991539001465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5260.808944702148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7882.068824768066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8095.280799865723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6053.629627227783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6205.10046005249, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8991.76399230957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9224.360389709473, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4807.774562835693, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5241.690616607666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7886.771087646484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8113.81248474121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6043.034381866455, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6202.785606384277, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9000.541343688965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9221.157264709473, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2861.185464859009, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2899.6649742126465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3039.9782371520996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3371.622085571289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 3189.772663116455, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3074.2504024505615, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3152.0294284820557, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3387.3132133483887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2674.692335128784, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2741.5833473205566, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2873.1006240844727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3182.318878173828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3165.7116985321045, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2923.6572647094727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2999.93070602417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3241.220169067383, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2631.664161682129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} 
-{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2701.073589324951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2865.827522277832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3164.26176071167, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3143.0513763427734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2881.092004776001, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2998.9297771453857, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3238.419075012207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2610.135660171509, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2679.1004943847656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2848.853931427002, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3141.5671825408936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3113.8391971588135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2874.32222366333, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2992.4087810516357, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3231.563186645508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2998.3241748809814, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3538.88126373291, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4541.369152069092, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4591.915645599365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3285.662250518799, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3416.860828399658, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4514.179973602295, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4756.4812660217285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2927.3929595947266, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3329.757614135742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 4065.9684944152837, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4292.513599395752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3211.86767578125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3192.1806144714355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4474.955062866211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4675.568618774414, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2908.5465717315674, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3317.319164276123, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4074.192638397217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4305.4304122924805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3193.121757507324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3178.0118560791016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4489.347724914551, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, 
"num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4689.27282333374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2889.308786392212, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3298.177604675293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4065.5955123901367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4299.972667694092, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3173.1617736816406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3168.003349304199, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4484.960670471191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4688.771991729736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4707.722549438477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6129.013919830322, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3971.7742919921875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5325.346088409424, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4491.353759765625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6070.331707000732, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3809.7813034057617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5318.385791778564, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4495.207462310791, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6050.131034851074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3848.9816665649414, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5319.352264404297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4480.509128570557, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6053.228282928467, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3830.306911468506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5331.415042877197, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2645.3375911712646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2740.9164905548096, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2942.057590484619, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3396.282215118408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2791.8868732452393, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2684.8777770996094, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2787.807502746582, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3316.223030090332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2545.5401611328125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2587.7246475219727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2703.953561782837, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2969.3715286254883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2682.7446365356445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2565.7172775268555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2704.6086502075195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3180.517120361328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2505.807695388794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2560.28302192688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2683.8115215301514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2972.8806495666504, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2659.173765182495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2542.5372886657715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2683.1230449676514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3173.1118488311768, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2492.860326766968, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 2540.917615890503, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2672.7654552459717, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2966.6897583007812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2651.095190048218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2526.7064094543457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2670.849094390869, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3172.3654556274414, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2822.297592163086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4021.46240234375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4006.723804473877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2985.1467418670654, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3496.418914794922, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3583.0467224121094, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2690.296154022217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3275.2033615112305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3365.4108810424805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2827.7331161499023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3144.350709915161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3245.333938598633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2664.6996688842773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3274.598560333252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3357.90225982666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2823.8379096984863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3141.2028789520264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3239.948310852051, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2656.415672302246, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3273.0588912963867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3344.326515197754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2819.5796871185303, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3142.66752243042, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3245.0380897521973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7468.764305114746, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5136.006107330322, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7173.5515213012695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4534.069080352783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7207.119598388672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4524.626083374023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7209.914016723633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4528.137454986572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2775.597610473633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2904.4750022888184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3118.497905731201, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2689.8905754089355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2620.643539428711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2736.3576126098633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2605.3681564331055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2575.7755088806152, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2597.467851638794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2484.5435333251953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2382.88911819458, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2440.0817489624023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2588.324022293091, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2577.5216102600098, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2589.388647079468, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2428.8852882385254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2383.675193786621, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2429.655990600586, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2588.593759536743, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2573.782091140747, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2580.9030532836914, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2406.85152053833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2379.979200363159, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2426.647367477417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 3961.3167762756348, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3724.070415496826, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3542.637462615967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3252.7791786193848, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3569.930839538574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3279.5262145996094, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3582.5556564331055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3281.7317962646484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2374.9667358398438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2363.015537261963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2469.046573638916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2636.2950801849365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2933.1019020080566, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 
1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2786.2177658081055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2857.806558609009, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2981.432809829712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2458.396472930908, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2414.6208000183105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2538.7096214294434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2695.8969402313232, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2894.0816020965576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2809.7070503234863, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2849.927349090576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2975.6071949005127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2468.083028793335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2424.6228790283203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2539.232635498047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2698.7817764282227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2902.682867050171, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2804.1208171844482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2862.282419204712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2978.3506965637207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2473.724822998047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2419.9643230438232, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2534.3817615509033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2698.1713676452637, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2899.4129753112793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2807.589912414551, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 2855.5313682556152, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2981.5920066833496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2663.2795238494873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2625.1976203918457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2965.526885986328, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4074.348964691162, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2983.815870285034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2827.1772956848145, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3149.468011856079, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4402.744312286377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2756.571521759033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2653.8516807556152, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2971.174077987671, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} 
-{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4121.101474761963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3044.738073348999, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2848.9119720458984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3129.304962158203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4454.999847412109, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2760.1497554779053, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2649.9431800842285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2973.3580684661865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4139.254055023193, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3044.8459148406982, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2856.2953662872314, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3123.3929443359375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4458.900909423828, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2767.0831966400146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2652.226400375366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2967.517442703247, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4114.990711212158, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3038.4968090057373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2857.829761505127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3135.0531005859375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4468.111057281494, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3609.7054481506348, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5193.585109710693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5422.665119171143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3643.6180877685547, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4903.718891143799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5064.913959503174, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3677.795524597168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5217.720584869385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5441.573429107666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3727.468032836914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4881.699199676514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5077.261753082275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3680.0318336486816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5218.181610107422, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5454.44128036499, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3750.2574729919434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4878.147830963135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5091.23104095459, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3680.569267272949, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5216.019382476807, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5455.9124755859375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3753.722038269043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4876.273937225342, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5101.654090881348, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2073.2859230041504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1901.355218887329, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1954.589605331421, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2138.2076930999756, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2206.1382484436035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 2030.9395313262942, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2087.3702430725098, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2266.5113735198975, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2087.238712310791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1904.84769821167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1958.400478363037, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2128.4636878967285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2216.427354812622, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2033.6316871643066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2086.618871688843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2273.0257987976074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2082.1815967559814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1904.2384147644043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1954.8950290679932, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2130.430564880371, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2203.6538982391357, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2035.9676837921143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2082.168016433716, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2284.886884689331, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2079.059371948242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1902.9017639160156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1953.128957748413, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2125.4708862304688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2205.1587295532227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2034.4441604614256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2079.591999053955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2274.364004135132, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2535.150566101074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2259.431505203247, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2845.5051136016846, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2914.4007873535156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2463.1679821014404, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2234.590082168579, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2912.757921218872, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3021.9435024261475, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2569.222402572632, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2258.2121562957764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2799.4305419921875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 2899.363832473755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2474.388484954834, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2201.64927482605, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2873.486557006836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2988.3344078063965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2560.309133529663, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2258.278570175171, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2794.201774597168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2906.28849029541, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2477.816162109375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2197.915687561035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2874.9288177490234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2992.1313762664795, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} 
-{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2554.886713027954, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2246.7395210266113, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2799.214868545532, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2908.0331134796143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2491.225748062134, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2195.72735786438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2878.4177589416504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2991.6307163238525, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3188.3969688415527, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4585.373268127441, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3781.713581085205, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3472.1510696411133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3110.698719024658, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4583.545951843262, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3819.6694374084473, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3483.972969055176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3115.3204822540283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4617.675189971924, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3830.5409622192383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3495.5118560791016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3122.009925842285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4619.451847076416, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3831.1009216308594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3493.672504425049, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1774.6553707122803, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1732.4222660064697, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2023.433427810669, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2105.2148723602295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1926.328945159912, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1747.3659229278564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2028.6406517028809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2068.9443016052246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1777.6380729675293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1697.9140949249268, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2001.6564655303957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2006.9626998901367, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.6582489013672, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1727.123498916626, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2048.303689956665, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2064.1391944885254, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1765.5036926269531, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1697.0948791503906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1996.7046356201172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2014.450874328613, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1986.7118167877197, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1724.1163158416748, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2043.101444244385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2070.168466567993, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1767.9752159118652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1690.178565979004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
2001.7927932739258, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2017.6481437683105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1986.7012786865234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1716.0296058654785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2056.33056640625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2071.194849014282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2280.185432434082, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2381.10463142395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2327.71183013916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2345.991849899292, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2227.5305461883545, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2355.4878520965576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2318.7060832977295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2318.8564682006836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2210.6049728393555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2354.4163131713867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2321.523332595825, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2322.78431892395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2207.3196983337402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2352.5390243530273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2321.721782684326, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2322.556962966919, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5269.579048156738, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3226.089630126953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5280.382270812988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2994.468011856079, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5306.164970397949, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2998.841257095337, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5340.772190093994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3000.4952144622803, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1736.9003009796143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1817.867841720581, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1844.2942428588867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2204.0719985961914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1571.3019180297852, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1676.655511856079, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1669.9276638031006, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1778.340015411377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1801.4459037780762, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2299.9059009552, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1633.461594581604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1641.5945720672607, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1657.5297832489014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1788.0214500427246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1803.341121673584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2307.8664016723633, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1628.8083171844482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1653.2007884979248, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1656.5063953399658, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1783.9534282684326, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1796.2208080291748, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2300.9156608581543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1624.0582466125488, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1649.1795063018799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3047.9569721221924, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2316.324167251587, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2988.1812858581543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2133.7673664093018, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2990.3555488586426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2142.683343887329, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3006.4640140533447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2144.823989868164, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1968.0879878997803, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1843.1980800628662, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1965.6065464019775, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2297.1195220947266, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2031.2476921081543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1970.7147216796875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2077.2993659973145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2520.996160507202, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2059.16880607605, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1986.493787765503, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2088.6003398895264, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2350.47438621521, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2114.6878242492676, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2054.633913040161, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2168.411512374878, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2555.338888168335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2078.6731338500977, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1988.6934280395508, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2088.3039951324463, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2353.251190185547, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2109.0505695343018, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2056.8019104003906, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2167.594585418701, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2577.12495803833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2084.486885070801, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1988.7305450439453, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2086.6374492645264, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2370.1675128936768, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2110.013608932495, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2056.52174949646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2166.485776901245, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2594.452476501465, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2260.727834701538, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2539.962863922119, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2508.5123252868652, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2491.8990516662598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2810.4096031188965, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2869.3971157073975, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2418.5827255249023, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 2629.3460750579834, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2641.023349761963, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2531.2494373321533, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2830.636339187622, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2872.23087310791, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2423.986883163452, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2645.1630306243896, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2639.1614151000977, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2532.3561477661133, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2830.993137359619, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2881.0789012908936, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2424.3142414093018, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2679.1438388824463, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 
1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2639.4916915893555, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2547.5068759918213, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2849.4545555114746, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2885.873441696167, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3568.21439743042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3626.3113594055176, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3677.0099449157715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3907.2679710388184, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3716.8251419067383, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3905.219192504883, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3793.213596343994, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3907.347011566162, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1440.1521587371826, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1458.5476779937744, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1645.1446533203125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1683.3689403533936, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1552.6040029525757, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1485.0561618804932, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1873.8601684570312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1888.633918762207, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1471.478238105774, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1467.3521614074707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1641.1303901672363, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1640.5348873138428, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1581.9297647476196, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 1483.4915208816528, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1885.3987216949463, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1892.7788829803467, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1466.513442993164, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1466.6603136062622, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1645.2782154083252, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1642.2620868682861, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1580.8089590072632, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1478.3070468902588, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1876.915683746338, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1898.0444622039795, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1468.7383937835693, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1467.1684789657593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 
3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1654.6161556243896, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1647.7076816558838, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1579.432315826416, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1474.7764825820923, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1881.3409423828125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1901.9969367980957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2239.4564723968506, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1791.3531303405762, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2586.477117538452, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2057.7115058898926, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2309.9473571777344, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1789.504976272583, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2661.491184234619, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2058.5496044158936, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2323.161449432373, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1791.2844944000244, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2663.17120552063, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2063.3236694335938, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2339.2657375335693, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1804.5136070251465, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2669.210557937622, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2065.5083179473877, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1232.027039527893, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1318.1190490722656, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1317.3673486709595, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1269.2607975006104, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1309.6385526657104, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1291.9852876663208, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1257.6239919662476, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1320.0656032562256, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1274.7187089920044, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1288.316798210144, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1316.1670446395874, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1253.4862327575684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1251.3371229171753, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1314.6345663070679, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1285.5884838104248, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1283.6652755737305, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1318.3768033981323, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1259.4539213180542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1253.532633781433, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1326.0905504226685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1289.2579221725464, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1284.3487977981567, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1318.4723281860352, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1264.8073625564575, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1751.1100769042969, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1707.0840072631836, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1777.3012828826904, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1719.8964881896973, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 1778.2929801940918, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1721.4891147613525, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1786.482572555542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1726.038408279419, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1579.8164749145508, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1382.9824018478394, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1618.8468837738037, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1412.4438428878784, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1625.344476699829, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1417.297601699829, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1639.138422012329, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1414.4838380813599, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1911.3032245635986, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1967.8265571594238, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1752.2313499450684, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1803.6036777496338, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1897.3329639434814, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1926.4302253723145, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1710.7283210754395, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1766.4305400848389, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1995.8062553405762, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2069.9638271331787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1959.3830299377441, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1979.8966217041016, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1975.342903137207, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2003.9855957031252, 
"config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1900.9603214263916, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1920.981912612915, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1997.518720626831, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2086.5457725524902, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1958.3865642547607, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1982.981767654419, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.9335842132568, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2009.5001602172854, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1893.1991863250732, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1918.327522277832, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2006.7752265930174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2093.8532733917236, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1958.1025695800781, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1985.1593494415283, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1992.9534530639648, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2025.5969619750974, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1896.0628700256348, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1918.2876777648926, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2485.310583114624, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2392.2119998931885, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2708.517904281616, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2613.7056064605713, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2739.5281505584717, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2627.535991668701, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2749.733934402466, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2627.3576068878174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1312.4516868591309, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1371.1948776245117, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1270.139832496643, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1263.5889625549316, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1305.927677154541, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.359040260315, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1343.798713684082, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1385.2571296691895, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1310.3020858764648, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1291.3950490951538, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1327.0172834396362, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 1247.3668766021729, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1345.3529596328735, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1385.2844858169556, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1319.5417547225952, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1294.631519317627, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1328.6287927627563, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1260.3399991989136, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1364.982409477234, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1405.8966398239136, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1327.6868772506714, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1312.0347356796265, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1341.5497636795044, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1269.441270828247, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1848.6526489257812, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1629.4103908538818, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1871.0152053833008, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1676.5619087219238, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1874.618215560913, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1676.3841533660889, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1894.251365661621, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1689.7099018096924, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1476.5927982330322, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1110.1559972763062, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1330.5219173431396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1004.140796661377, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1501.026725769043, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1125.6663942337036, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1354.266881942749, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1019.5953702926637, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1508.3126401901245, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1133.9270496368408, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1352.785120010376, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1026.3347148895264, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1534.6483182907104, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1166.9036865234375, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1380.561923980713, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1042.5006341934204, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5522.248821258545, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 
1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1113.4160041809082, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5470.490398406982, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1125.6692743301392, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5484.120445251465, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1136.4676904678345, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5520.324821472168, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1159.1534423828125, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2378.0892753601074, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1687.1630477905273, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2392.27313041687, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1698.1086158752441, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2512.5012588500977, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1811.3804912567139, "config": {"BLOCK_SIZE_M": 256, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2538.1628704071045, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1804.7070598602295, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2522.376136779785, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1819.1190338134766, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2555.8112239837646, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1805.8307266235352, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2550.4505825042725, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1844.9092769622803, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2573.529920578003, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1825.0847816467285, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2023.0862617492678, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1507.276315689087, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2053.344955444336, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1537.781286239624, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2122.2268676757812, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1551.8648052215576, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2237.440004348755, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1632.534580230713, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8713.604316711426, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1220.901608467102, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8574.843406677246, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1244.5124912261963, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8596.102027893066, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1268.5089540481567, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8652.186546325684, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1299.2396926879883, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6719.045829772949, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6646.787338256836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6822.736511230469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7198.8043212890625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9677.865180969238, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9703.671226501465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9818.126602172852, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10076.600303649902, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6609.2461013793945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6655.421257019043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6807.887229919434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7085.511932373047, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9732.455673217773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9927.391128540039, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10030.67813873291, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10281.354789733887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6485.452919006348, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6611.485710144043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6747.314453125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6970.997009277344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9748.673858642578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9867.727699279785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9958.968276977539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10270.403823852539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6382.735500335693, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6559.672164916992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6699.045562744141, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6900.298919677734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9748.525924682617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9864.921913146973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9995.019302368164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10280.852813720703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6825.440444946289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7223.251495361328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7947.279357910156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9303.040809631348, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9681.023712158203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, 
"num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9596.994743347168, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10191.981315612793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11701.329498291016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6696.771507263184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7077.397804260254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7618.75057220459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9020.933380126953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9724.002075195312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9601.309242248535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10163.078880310059, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11606.125602722168, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6585.039100646973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6958.767623901367, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7512.055206298828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8863.439254760742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9687.592086791992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9602.247543334961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10194.06509399414, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11706.768798828125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6529.177284240723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6899.077339172363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7474.810562133789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8827.585792541504, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9687.701606750488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9602.64144897461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10169.466819763184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11735.445899963379, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9477.139663696289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10695.657348632812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15290.210266113281, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15674.352188110352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11773.2661819458, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12233.904457092285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17582.018432617188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17966.37222290039, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9243.216171264648, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10297.956352233887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15336.405029296875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15736.24641418457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11684.675407409668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12215.528450012207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17559.83039855957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17977.873992919922, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9164.625511169434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10182.287864685059, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15369.207153320312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15773.988800048828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11736.568374633789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12148.052673339844, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17595.061569213867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 18015.470428466797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9149.334564208984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10125.5904006958, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15400.078659057617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15822.32666015625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11784.226951599121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12156.501388549805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17620.61851501465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 18044.183044433594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5675.832767486572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5698.177127838135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5975.10383605957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6618.336334228516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6171.899662017822, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6066.244468688965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6220.200786590576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6645.095367431641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5273.587017059326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5402.191505432129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5646.9794845581055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6251.028804779053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6061.439208984375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5777.230854034424, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5890.69356918335, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6332.887668609619, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5129.495010375977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5286.825923919678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5573.205261230469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6154.095039367676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5995.698890686035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5721.798915863037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5908.659362792969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6315.314407348633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5041.260147094727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5259.460315704346, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5519.79362487793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6125.026073455811, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5976.546192169189, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 5714.399166107178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5892.145481109619, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6302.975978851318, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5894.132957458496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6954.125137329102, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8886.41887664795, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8952.797317504883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6392.960987091064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6671.162910461426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8813.133926391602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9337.667121887207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5698.531799316406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6447.576484680176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7924.637184143066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8369.136543273926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6173.743152618408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6264.206714630127, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8743.07674407959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9117.575721740723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5611.920680999756, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6356.739044189453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7934.805641174316, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8377.519340515137, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6116.063995361328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6204.092330932617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8743.186950683594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9119.889526367188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5566.061267852783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6338.58283996582, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7934.606513977051, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8394.662551879883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6071.737442016602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6201.879234313965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8753.512344360352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9142.232971191406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9188.326034545898, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12029.552192687988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7759.568176269531, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 10425.077896118164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8711.600723266602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11884.332962036133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7467.669868469238, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10386.559562683105, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8649.6439743042, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11911.956939697266, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7401.120491027832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10385.019073486328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8624.097480773926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11920.186614990234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7429.005966186523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10401.572341918945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} 
-{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5242.619190216064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5362.594890594482, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5777.8839683532715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6670.206451416016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5444.042701721191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5267.865428924561, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5436.658191680908, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6524.570274353027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4925.76530456543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4999.675884246826, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5297.648010253906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5717.866916656494, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5186.768817901611, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5078.205165863037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5308.9606285095215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6162.939872741699, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4826.736145019531, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4905.82498550415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5236.624011993408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5716.376152038574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5110.968036651611, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5017.948169708252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5274.105796813965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6174.01424407959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4792.508163452148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 4855.534687042236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5203.804988861084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5710.014667510986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5096.951522827148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4953.240985870361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5224.959354400635, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6158.796844482422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5487.086582183838, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7942.040672302246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7822.897109985352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5868.934917449951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6912.585296630859, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6989.776382446289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, 
"num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5174.915199279785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6361.993618011475, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6542.569770812988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5491.790885925293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6112.320308685303, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6322.597770690918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5086.477298736572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6363.957786560059, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6526.952133178711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5421.695194244385, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6107.9155349731445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6322.765789031982, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
5067.203693389893, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6360.346431732178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6526.633529663086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5434.59924697876, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6110.4375648498535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6325.835647583008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 14440.595092773438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10029.375038146973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13654.854125976562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8831.327857971191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13490.739364624023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8784.872779846191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13635.31265258789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8793.098983764648, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5438.973579406738, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5876.558570861816, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6112.789287567139, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5267.197914123535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5274.315223693848, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5352.274875640869, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5036.7919921875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4881.283855438232, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4955.075969696045, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4842.444686889648, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4561.748313903809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4639.947700500488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4948.326072692871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4874.974060058594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4946.101150512695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4721.37565612793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4536.0846519470215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4649.232139587402, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4942.004203796387, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4873.052978515625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4942.622852325439, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4680.519638061523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4532.869606018066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4647.036476135254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 7762.73006439209, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7243.764915466309, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6887.067565917969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6274.78572845459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6919.398880004883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6312.515525817871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6959.618301391602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6354.217758178711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4622.491874694824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4635.30553817749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4815.625591278076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5102.1173095703125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5546.114749908447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5374.777774810791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5499.715843200684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5727.480506896973, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4762.509899139404, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4740.871715545654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4904.640007019043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5184.524936676025, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5479.5676612854, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5418.470726013184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5554.413585662842, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5727.69588470459, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4751.7206382751465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4738.061141967773, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4903.37345123291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5179.840145111084, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5485.44620513916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5408.506565093994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5529.57950592041, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5700.348815917969, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4757.808666229248, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4751.684169769287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4917.278347015381, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5181.759376525879, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5499.914436340332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5461.9610023498535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 5587.750091552734, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5740.932846069336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5121.468772888184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5075.416164398193, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5739.204483032227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7846.584854125977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5739.576950073242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5431.143550872803, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6058.545303344727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8488.752326965332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5238.2073974609375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5097.761116027832, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5702.49870300293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} 
-{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7878.4770584106445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5780.096168518066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5491.426048278809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6018.958568572998, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8512.580184936523, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5250.671329498291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5082.024211883545, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5694.69690322876, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7919.532051086426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5784.579372406006, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5471.473426818848, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6036.342372894287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8564.574279785156, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5288.1340408325195, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5090.164642333984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5668.336143493652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7917.037124633789, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5795.811672210693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5504.110870361328, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5994.868011474609, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8560.474281311035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6933.077812194824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9977.87338256836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10447.89264678955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6998.233451843262, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 9419.699249267578, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9716.440353393555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6991.3177490234375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10006.340827941895, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10453.75747680664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7009.168014526367, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9375.206527709961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9745.266418457031, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7065.209732055664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10026.421699523926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10478.891372680664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7125.404891967773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9385.543937683105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, 
"num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9792.298126220703, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7063.996353149414, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10041.904258728027, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10494.761810302734, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7111.309547424316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9399.676361083984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9774.037818908691, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3976.5615844726562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3661.437587738037, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3802.838077545166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4128.471527099609, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4233.222236633301, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3965.367965698242, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4101.369171142578, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4347.764015197754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3982.5880241394043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3647.1345710754395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3763.55411529541, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4108.3514976501465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4254.422721862793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3899.9710655212402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4067.5547218322754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4355.852947235107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3977.1823692321777, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3639.569969177246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3756.5689849853516, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4102.888488769531, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4246.247692108154, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3916.4363288879395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4063.869132995605, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4354.857635498047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3966.131076812744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3652.851333618164, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3771.1635208129883, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4087.170925140381, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4235.361251831055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3896.888198852539, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4048.754539489746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4368.2402992248535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4860.128269195557, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4339.653720855713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5431.69454574585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5583.976955413818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4665.000591278076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4324.3256187438965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5608.589458465576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5810.016174316406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4878.243026733398, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4247.850227355957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5350.406894683838, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5542.430839538574, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4630.462207794189, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4282.983207702637, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5511.516456604004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5707.33154296875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4890.0346755981445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4245.938529968262, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5352.949619293213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5560.164966583252, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4683.6944007873535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4238.619518280029, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5513.0682945251465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5727.06579208374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4875.992813110352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4254.8846435546875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5354.137725830078, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5553.392639160156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4698.7470626831055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4241.160469055176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5504.038066864014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5713.783416748047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6160.447177886963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8896.777877807617, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7142.779083251953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6663.7260818481445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5955.458526611328, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 
256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8861.645164489746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7241.336822509766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6665.782623291016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5975.148658752441, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8893.209762573242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7256.1761474609375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6693.633346557617, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5989.556312561035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8896.155548095703, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7316.997108459473, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6686.792411804199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3411.756172180176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 3311.597900390625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3872.435531616211, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4017.947502136231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3651.4743995666504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3343.945598602295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3790.4512214660645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3914.3822288513184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3369.582862854004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3199.1705799102783, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3732.6598358154297, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3764.6467208862305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3665.0406455993652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3255.735673904419, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3813.533306121826, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3874.554023742676, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3361.0094261169434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3179.795846939087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3726.3537979125977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3765.1804542541504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3694.184799194336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3269.3628883361816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3837.036647796631, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3890.145778656006, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3347.337589263916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3169.575662612915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3718.5826110839844, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3782.865791320801, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3685.0535583496094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3274.3803787231445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3838.623790740967, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3874.7361755371094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4412.995338439941, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4529.288959503174, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4403.776073455811, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4407.669315338135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4226.780014038086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4427.486553192139, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4245.452346801758, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4343.118095397949, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4215.660171508789, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4430.881462097168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4276.342086791992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4352.206382751465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4197.697582244873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4424.772644042969, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4249.654407501221, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4348.68782043457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9956.268844604492, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6132.611827850342, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9914.452590942383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5721.67423248291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9915.113525390625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5726.709575653076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9963.97087097168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5745.419521331787, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3288.0713844299316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3440.7533073425293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3533.184986114502, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4117.703990936279, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3020.559377670288, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3217.171401977539, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3135.1529598236084, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3363.2597160339355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
3428.556308746338, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4233.564605712891, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3077.113914489746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3138.3896160125732, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3087.5153827667236, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3367.1657371520996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3415.7278442382812, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4304.258728027344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3058.9667224884033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3139.2878437042236, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3039.786729812622, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3359.0796661376953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3409.5737838745117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4302.924461364746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3076.761131286621, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3132.5169563293457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5709.120826721191, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4337.77811050415, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5501.945781707764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3949.434070587158, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5529.47904586792, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3974.8748779296875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5538.884315490723, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3989.529285430908, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3579.679374694824, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3477.0164680480957, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3738.2571601867676, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4172.719345092773, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3737.8482055664062, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3696.956615447998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3858.394241333008, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4523.578262329102, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3870.459041595459, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3734.9222373962402, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3910.4451179504395, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4386.444339752197, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3927.2436904907227, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3854.030227661133, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 4021.6636466979985, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4648.143367767334, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3917.4765014648438, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3744.125270843506, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3912.363510131836, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4372.611408233643, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3943.5300827026367, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3885.4993438720703, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4039.8006439208984, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4611.114368438721, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3929.060935974121, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3757.2817611694336, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3933.6196517944336, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} 
-{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4405.962390899658, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3971.8227005004883, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3995.2252769470215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4136.6657638549805, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4662.1452713012695, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4300.489978790283, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4640.682849884033, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4648.847255706787, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4427.051029205322, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5055.168476104736, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5191.24719619751, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4579.711971282959, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4833.220100402832, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4886.607837677002, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4585.250225067139, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5087.9157066345215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5206.540508270264, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4609.048328399658, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4840.1225662231445, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4887.937641143799, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4592.612934112549, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5114.92338180542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5207.694129943848, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4592.380828857422, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4966.184329986572, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4901.796016693115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4608.4792137146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5195.35774230957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5222.2881507873535, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6292.737102508545, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6416.084156036377, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6681.443252563477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7235.70613861084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6843.65852355957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7262.9949951171875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7099.631423950195, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7125.032081604004, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2660.330228805542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2699.741430282593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3008.1691455841064, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3090.1102352142334, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2841.2643146514893, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2739.583044052124, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3314.29386138916, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3351.932792663574, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2697.7091312408447, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2719.2323207855225, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2911.317768096924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3014.7740650177, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2900.0027179718018, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2746.8052864074707, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3317.6177406311035, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3337.038097381592, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2701.7580890655518, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2711.64701461792, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2929.2761611938477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3015.688304901123, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2889.711494445801, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2723.3454418182373, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3315.72359085083, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3343.924789428711, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2689.4153594970703, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2693.5574340820312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2975.6169509887695, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3062.254867553711, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2888.156156539917, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2696.095027923584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3328.581771850586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3340.1158332824707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3976.032199859619, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3242.522602081299, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4588.773288726807, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3661.8167686462402, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4165.911712646484, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3171.815528869629, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4771.067371368408, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, 
"num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3660.054931640625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4221.89245223999, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3169.11057472229, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4794.654693603516, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3656.919403076172, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4282.778377532959, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3213.0689430236816, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4823.065624237061, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3661.6787147521973, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2250.163679122925, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2389.010238647461, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2427.599687576294, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2276.1079692840576, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2301.0649585723877, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2359.819211959839, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2212.0164680480957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2294.3850994110107, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2225.3878116607666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2271.3478469848633, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2296.613130569458, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2195.966739654541, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2189.402551651001, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2299.847345352173, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2223.944625854492, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2253.1158351898193, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2303.5876655578613, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2190.855369567871, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2189.1111850738525, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2315.421733856201, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2252.940788269043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2252.782096862793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2316.4827251434326, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2211.952495574951, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3048.3747005462646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2964.126558303833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3124.6545600891113, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3031.409730911255, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3150.1737689971924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3046.3764667510986, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3191.504487991333, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3081.980972290039, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2653.1222343444824, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2369.8475074768066, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2735.572328567505, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2435.5695819854736, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2742.113780975342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2421.7654418945312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2770.2352046966553, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2436.0139179229736, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3506.628475189209, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 3608.0536460876465, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3371.234073638916, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3577.9761505126953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3515.850601196289, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3516.7840003967285, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3316.903839111328, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3506.32869720459, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3858.8800048828125, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3948.9412879943848, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3792.635040283203, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3824.6449851989746, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3800.840301513672, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3813.681240081787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 
3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3674.298572540283, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3716.061420440674, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3886.4574241638184, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3977.9373359680176, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3791.8465995788574, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3967.6192474365234, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3823.4302139282227, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3834.3567848205566, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3667.2668838500977, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3859.7997093200684, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3906.6576194763184, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4005.243988037109, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3789.136619567871, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4155.125885009766, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3843.2198333740234, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3869.423007965088, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3676.29695892334, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4045.7315063476562, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4659.537754058838, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4484.645309448242, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5117.50452041626, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4955.887184143066, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5172.786407470703, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4974.456748962402, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5216.9169998168945, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5019.363479614258, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2496.210880279541, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2517.8284740448, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2396.563034057617, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2446.7660903930664, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2379.775342941284, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2349.7417545318604, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2573.028335571289, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2571.6137504577637, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2496.100015640259, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2512.514228820801, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2459.3337535858154, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2386.9185638427734, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2580.7145404815674, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2583.99582862854, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2548.3105659484863, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2515.1895904541016, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2473.508176803589, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2437.9619312286377, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2594.046697616577, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2616.5042972564697, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2580.185146331787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2524.306221008301, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2493.4907245635986, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2472.1491050720215, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 3462.219524383545, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2972.778091430664, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3604.2043113708496, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3131.0430335998535, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3639.336452484131, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3161.7191791534424, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3681.749267578125, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3194.309787750244, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2761.0827255249023, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2083.7707138061523, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2475.4097652435303, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1823.7574291229248, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2826.931371688843, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} 
-{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2091.691026687622, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2516.9739151000977, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1826.8292713165283, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2845.4255962371826, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2107.886390686035, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2531.731996536255, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1830.2620792388916, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2886.7788696289062, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2124.3949031829834, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2532.0750427246094, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1843.4707164764404, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9357.18978881836, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1935.8806419372559, 
"config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9299.665985107422, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.0145473480225, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9339.782829284668, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1995.8995532989502, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9433.947868347168, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2019.573745727539, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4189.321727752686, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3215.68660736084, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4252.872180938721, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3235.896167755127, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4701.358585357666, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3447.967052459717, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4758.719863891602, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3430.4079818725586, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4717.36701965332, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3460.278377532959, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4778.1086349487305, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3441.537628173828, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4823.398418426514, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3521.0648155212402, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4879.204940795898, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3506.706199645996, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3868.5687828063965, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2769.857921600342, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3918.009262084961, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2919.9692916870117, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4039.7025489807124, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2953.751850128174, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4260.711822509766, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3060.1609802246094, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15353.79753112793, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2124.056167602539, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15376.233596801758, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2176.843204498291, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15412.657318115234, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2215.5020904541016, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15426.503601074219, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 2279.019536972046, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.45039999485016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 150.69792091846466, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 149.7355192899704, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.03056073188782, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.95344066619873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.68080008029938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.060959815979, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.80224061012268, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.10928070545197, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.01664006710052, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.15584015846252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.48559999465942, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.52992033958435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.29471957683563, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.09200143814087, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.99055922031403, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.7033599615097, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.1153599023819, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.17295920848846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.90783989429474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.49520087242126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.0083191394806, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.65439975261688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.8662406206131, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.72784006595612, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.49439918994904, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.37823963165283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.07344043254852, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.99295926094055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.76927947998047, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.36784040927887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.74752008914948, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.42127990722656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.4313609600067, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.25791907310486, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.77423977851868, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.5366405248642, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.3931188583374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.56047928333282, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.1374410390854, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.86272037029266, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.91935896873474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.21360063552856, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.21856009960175, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.04879999160767, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.9203199148178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.05248022079468, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.78048050403595, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.9871997833252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.03648006916046, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
166.37999892234802, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.65424036979675, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.96384024620056, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.22560095787048, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 185.2950394153595, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.53231966495514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.79983949661255, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.01967930793762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.86303961277008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.01760005950928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.16432011127472, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.55104076862335, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.43856120109558, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.3924798965454, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.37008094787598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.56496012210846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 230.8844769001007, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 230.60800075531006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.93551993370056, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.59008073806763, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.19871830940247, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.59888100624084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.19296061992645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.89247858524323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 231.6652810573578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 231.54736042022705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, 
"num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.4027203321457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.43359971046448, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.83327770233154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.00016021728516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.25824058055878, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.32352018356323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 232.94415950775146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.33104038238525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.81968021392822, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.57551956176758, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.4515197277069, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.6992003917694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.36015856266022, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.26719880104065, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 233.24703931808472, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.65280055999756, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 191.49711906909943, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.2491194009781, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.32896256446838, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.16896200180054, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.69264030456543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.8857605457306, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.9398386478424, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.15072095394135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.40432024002075, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 153.5068792104721, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.16815972328186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.4772790670395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.5630396604538, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.1950399875641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.1499207019806, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.70640003681183, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.53648054599762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.3998395204544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.1524807214737, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.98399913311005, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.26672065258026, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.55951976776123, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.85712039470673, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.31472027301788, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.61184084415436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.39295935630798, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.16511988639832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.1276797056198, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.66991865634918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.24351906776428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.010560631752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.34303975105286, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.00735998153687, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.10719847679138, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.199840426445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.44687938690186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.31727957725525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.80768084526062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.3873610496521, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.91631960868835, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.51616048812866, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.96559989452362, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.90031898021698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.19999992847443, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.17680060863495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.08559834957123, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.57824039459229, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.59632062911987, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.72127985954285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.92304074764252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.01920127868652, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.90704035758972, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.1710386276245, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.22160017490387, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.5967993736267, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.1868795156479, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.22224009037018, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.82863926887512, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.09647965431213, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.8220797777176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 156.36751890182495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.4454402923584, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.51712048053741, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.59407937526703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.8430414199829, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.7148813009262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.44256055355072, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.89279973506927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.44048023223877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 204.4696009159088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.13263976573944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.49151873588562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.14208030700684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 200.76879978179932, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.95296132564545, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.8542401790619, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.1551994085312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 203.20144057273865, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.4385598897934, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 185.22816061973572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.55791974067688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 203.7329602241516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.5001586675644, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.52832078933716, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.74160051345825, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.60736083984375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.7956793308258, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.51119947433472, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.56512022018433, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 150.91855883598328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.354079246521, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.14991998672485, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.98976004123688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.9126387834549, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.73264026641846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.83071970939636, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.6275199651718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.50543999671936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.83551919460297, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.77504110336304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.42975914478302, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.38704097270966, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.93152022361755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.01104032993317, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.4513601064682, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.91376042366028, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.08207952976227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.8555190563202, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.99152040481567, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.71023952960968, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.76879930496216, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 156.01855874061584, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.0619193315506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.3384004831314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.10511946678162, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.87984085083008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.75295972824097, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.15808033943176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.03152060508728, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.16528034210205, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.6841596364975, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.0489593744278, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.21919977664948, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.06384015083313, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.29728066921234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.78608000278473, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.76623928546906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.4524803161621, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.56639957427979, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.89151895046234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.12063896656036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.67711997032166, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.52255988121033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.38416051864624, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.86464047431946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.4291205406189, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.11039912700653, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.26736080646515, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.52336061000824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.90527963638306, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.579039812088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.1216002702713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.8436801433563, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.86928033828735, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.44528126716614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.8523187637329, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.23792338371277, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.60656070709229, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.0004804134369, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
167.37855851650238, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.8857593536377, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.63296175003052, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.0030403137207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.63072049617767, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.2297601699829, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.67119979858398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.9124791622162, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.50512146949768, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.55184054374695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.19440078735352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.49727964401245, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.37599980831146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.25760090351105, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.9635202884674, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.59504067897797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.15344083309174, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.53616058826447, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.9177609682083, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.34287905693054, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 191.49551928043365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.4083207845688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.47408092021942, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.68063962459564, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 185.73984026908875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.1723198890686, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} 
-{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.3742400407791, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.33616054058075, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.03407907485962, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 191.89024031162262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.23391997814178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.96016025543213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.91359865665436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.34271812438965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.62304186820984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.21487975120544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.9670408964157, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.75471985340118, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.69840002059937, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.1233607530594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.74863970279694, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.4263995885849, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.10112011432648, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.45759975910187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.896479845047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.36655950546265, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.75375962257385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.97776019573212, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.56959974765778, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.4251207113266, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.47439908981323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
176.0039985179901, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.14047861099243, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.53871977329254, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.80736076831818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.13104021549225, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.0048007965088, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.28239905834198, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.5307193994522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.6656002998352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.29392170906067, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.59440004825592, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 196.32783830165863, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.62688064575195, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.86799955368042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.32800030708313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.2555195093155, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.5619192123413, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.12976050376892, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.44992101192474, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.8102412223816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.6822406053543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.32912051677704, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.38991963863373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 200.06319761276245, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.0823996067047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.89967954158783, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} 
-{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.32176005840302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.55328011512756, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.68655967712402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.67535960674286, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.22607898712158, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.72272372245789, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.86719965934753, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.46352005004883, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.32240045070648, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.2708775997162, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.3342399597168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.88031935691833, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.73312056064606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.58175897598267, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.95983910560608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.70463871955872, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.7838408946991, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 206.30447924137115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.78480052947998, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.6644802093506, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.07919669151306, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.48752164840698, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 232.25311756134033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.39935839176178, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.4579187631607, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
245.26864171028137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.88159823417666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.4624000787735, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 224.87695813179016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.32368111610413, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.20944106578827, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.81952118873596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.83712244033813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.49727964401245, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 227.5519984960556, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.5512011051178, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.51216113567352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.13535904884338, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.40959930419922, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.63311982154846, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 227.93264091014862, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.2776017189026, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.743199467659, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.2022407054901, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.12400043010712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.8691202402115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.7564799785614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.09503960609436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.60639917850494, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.13216030597687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.63296020030975, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.82367980480194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.12992131710052, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.8766404390335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.47664082050323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.93616092205048, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.984800696373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.757758975029, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.92400085926056, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.69823813438416, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.0063999891281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.42656135559082, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.55247914791107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.61103999614716, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.29135990142822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.19727861881256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.7915209531784, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.5574390888214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.07968151569366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.80560171604156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.5083202123642, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.39759957790375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.0598406791687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.25311923027039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.8224000930786, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.68255925178528, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
174.7468799352646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.32255935668945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.59264087677002, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.27200043201447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.04255974292755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.90607941150665, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.24704158306122, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.91567969322205, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.55248081684113, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.17872047424316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.4097602367401, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.06048047542572, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.67519783973694, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.71871995925903, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.4862381219864, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.17551958560944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.40143883228302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.81695973873138, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.553280711174, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.13776004314423, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.33488059043884, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.72944116592407, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.37648034095764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.55328047275543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.49967873096466, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.27728056907654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 
5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.9236809015274, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.89984214305878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.43728017807007, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.89904034137726, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.45871901512146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 231.47487878799438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 207.36495971679688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.82607913017273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.81967997550964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 230.35696029663086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 206.04048132896423, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.15471816062927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 185.25120079517365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 229.41807985305786, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 206.3483190536499, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.93311965465546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.84720063209534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 231.87552213668823, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 206.9705581665039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.8438402414322, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.63039934635162, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.62480008602142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.74032056331635, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.34880018234253, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.56303906440735, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
155.83616018295288, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.4902399778366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.9276807308197, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.41120088100433, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.79808020591736, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.81232011318207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.91087925434113, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.08816003799438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.1292805671692, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.9582403898239, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.33183991909027, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.96863925457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.0087994337082, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.51327979564667, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.3899201154709, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.47488141059875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.76576161384583, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.35455989837646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.48287868499756, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.28928089141846, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.15184140205383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.0643196105957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.66607999801636, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.8403195142746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.78703904151917, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.29919981956482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.48640024662018, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.83759808540344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.90367913246155, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.05728149414062, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.45280003547668, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.10607981681824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.1484798192978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.722238779068, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.0113605260849, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.8334412574768, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.17631912231445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.28896045684814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.50672006607056, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.0951999425888, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.9264007806778, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.1619223356247, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.7431995868683, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 315.8030414581299, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 186.55376076698303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 314.1470408439636, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.68752002716064, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 313.718878030777, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.74671971797943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 312.5377595424652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.49568057060242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 211.00928008556366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.43535923957825, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.0980784893036, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 205.11056005954742, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.88384091854095, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.31887888908386, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 212.02159881591797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.9027200937271, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.79312086105347, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 209.25599932670593, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.1166399717331, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.47167909145355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.49951922893524, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.47871923446655, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.02896010875702, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.77856063842773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.92063987255096, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.3062402009964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.55487871170044, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.12224090099335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.82448017597198, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 209.62159872055054, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.2233612537384, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.0076801776886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.44864201545715, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.7331190109253, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.0635199546814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.725279211998, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.3529658317566, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.99152100086212, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.4566388130188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.22880148887634, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.69360053539276, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.85759973526, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.20048010349274, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.20736038684845, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.79168117046356, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.96319949626923, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.79311990737915, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.34767985343933, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.6791990995407, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.53760063648224, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.03488051891327, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.14975833892822, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 191.9319999217987, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.62143921852112, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.70431971549988, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 212.20144152641296, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.51007986068726, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.04687929153442, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.49151968955994, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 194.13551926612854, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 191.64543986320496, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.1158413887024, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.14223992824554, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.34960162639618, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.4664009809494, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.4425595998764, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.08463847637177, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.81535875797272, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.87183952331543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.7304002046585, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.3415995836258, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.64767956733704, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.73680138587952, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 213.74927937984467, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.4323160648346, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.87696170806885, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.11264038085938, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 232.46831893920898, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 196.74911975860596, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 212.60720074176788, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.73920142650604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.6107213497162, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 229.1430377960205, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 226.0867202281952, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 201.46607875823975, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, 
"num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 216.69024109840393, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.80512046813965, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 233.33600163459778, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 231.73727869987488, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 229.18943762779236, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.16432082653046, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 212.0748782157898, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.86415767669678, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 232.87392020225525, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.79472136497498, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 230.30336260795593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 294.5622384548187, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.5008008480072, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 285.1742386817932, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 294.405118227005, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.7708809375763, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 300.2516806125641, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 292.02272057533264, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 297.50223755836487, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.60176002979279, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.60128045082092, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.7908810377121, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.66800010204315, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.74912095069885, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.0027197599411, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 174.6252804994583, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.21456038951874, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.34960079193115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.7959998846054, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.76576030254364, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.8563185930252, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.71999967098236, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.314399600029, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.04816055297852, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.10815978050232, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.78799974918365, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.16352033615112, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.9838389158249, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.23151993751526, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.89567971229553, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.15759921073914, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.25039982795715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.20527935028076, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.81984090805054, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.04127979278564, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.13072037696838, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.015199303627, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.47120141983032, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.4281586408615, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.71311902999878, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.7104001045227, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 206.92512094974518, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.34607899188995, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 227.73215889930725, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.33023929595947, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.14239859580994, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.5110386610031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.7659239768982, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.1804802417755, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 205.0601589679718, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.49696099758148, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.99072408676147, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.093279838562, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 206.10463917255402, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.28672075271606, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.5974419116974, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.06976056098938, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.73232126235962, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.281919836998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.064000248909, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.10015964508057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.7222397327423, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.23727977275848, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.35791981220245, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.44047915935516, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.52815961837769, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 175.59343934059143, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.45616137981415, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.28223931789398, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.36703991889954, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.74832117557526, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.2865605354309, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.46239984035492, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.4827196598053, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.3974405527115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.3734403848648, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.63552105426788, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.650399684906, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.8736013174057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.9953602552414, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.8182407617569, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.12192142009735, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.96160101890564, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.69151973724365, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.7414401769638, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.54399859905243, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.46752035617828, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.10336124897003, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.71616005897522, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 227.779198884964, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 207.62336134910583, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.58287930488586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 206.46415948867798, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 227.30000138282776, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 205.49311876296997, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.69855880737305, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 204.8859190940857, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.0721607208252, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.7169587612152, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.03184056282043, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.80335879325867, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.05072236061096, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.18207812309265, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.00400233268738, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.38623976707458, "config": {"BLOCK_SIZE_M": 
128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.5219204425812, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 277.09375977516174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.77520370483398, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.65999960899353, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 292.14239954948425, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.40320086479187, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.74687957763672, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.80896162986755, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 287.74927973747253, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 275.74896335601807, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.5024013519287, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.53792142868042, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 289.2574405670166, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.5294370651245, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.6815996170044, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.3460793495178, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.96991991996765, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 278.9619183540344, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.34991979599, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.07568192481995, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.84912037849426, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 275.0611209869385, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.1238396167755, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.45407891273496, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.4670422077179, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 341.28031969070435, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 342.1727979183197, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 339.3601596355438, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 343.49743843078613, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 337.02688217163086, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.79840064048767, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 334.4364798069, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 204.90207970142365, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.42191970348358, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.72048115730286, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 201.3379204273224, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.38480019569397, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.08336114883423, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, 
"num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 205.77903747558594, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 201.92095935344696, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.41392064094543, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 201.01472079753876, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.48048055171967, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.8647998571396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 203.16927909851074, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.9374407529831, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.26367938518524, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 201.61232113838196, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.0217628479004, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.95680105686188, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 203.39184165000916, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 200.48672378063202, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.26671886444092, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.8686408996582, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 191.29791975021362, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.61423909664154, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.8993618488312, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.57088112831116, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.0646402835846, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.96399927139282, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.3617603778839, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.1057629585266, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.80623984336853, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 235.5449616909027, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 225.28208136558533, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.99296081066132, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.6732804775238, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.4488000869751, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 225.94768166542053, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.64687943458557, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 209.83007788658142, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.3123208284378, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 225.6771218776703, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.56256079673767, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 209.6016013622284, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.4497607946396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 
2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 225.54175853729248, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 185.20048022270203, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 209.4760024547577, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.41823995113373, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1046.9865655899048, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.89664149284363, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1039.7495985031128, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.25871801376343, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1048.6976099014282, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.28000116348267, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1049.9424028396606, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.4414393901825, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.833432674408, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.8878357410431, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.4657554626465, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.32623958587646, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.890079498291, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.58991980552673, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.1241636276245, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.22288155555725, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.0121564865112, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.59711813926697, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.8275213241577, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 399.49488282203674, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.8575963973999, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
395.46767950057983, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.6048030853271, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 402.61215806007385, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.027045249939, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.73151993751526, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.4056024551392, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.46607971191406, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.9331202507019, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.94943833351135, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.6887955665588, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.10223841667175, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2004.9923133850095, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 280.5076801776886, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2004.9726390838623, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 279.371680021286, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2009.775676727295, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 277.35008120536804, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2001.6785526275632, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 275.7851207256317, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.5899221897125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.41791915893555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.3571183681488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.5233588218689, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.6763210296631, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.85487961769104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.32143998146057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.7729594707489, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.96288204193115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 342.6679992675781, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.2633628845215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.765442609787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.894079208374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.8280007839203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.6251208782196, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.41264390945435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.40736150741577, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.8375999927521, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.15296244621277, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 349.55039858818054, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.0814392566681, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.0839982032776, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.1713614463806, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.5841598510742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.0174403190613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.9780797958374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.2766389846802, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.79936051368713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.05168080329895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.54928064346313, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.0457592010498, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.09791898727417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.1951994895935, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.7022387981415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.80336117744446, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.1308796405792, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.82352089881897, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.44288086891174, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.932000875473, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.73775911331177, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.8291189670563, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.7942407131195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 343.31103920936584, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.4702398777008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.67232060432434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 376.1244761943817, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.04992294311523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.802237033844, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 344.79056000709534, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 343.85151982307434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.01856303215027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.92719650268555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.5420799255371, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.1031982898712, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.7860805988312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 391.2289583683014, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.497918844223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.41663932800293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.31024146080017, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.9520003795624, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.7297646999359, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.302401304245, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.9635193347931, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 383.87743949890137, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.0740761756897, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.2872009277344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.44592332839966, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.9347233772278, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 423.2379174232483, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 408.87295484542847, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.8046350479126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, 
"num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9209585189819, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.44351983070374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.25920033454895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.3529648780822, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.91695976257324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 425.6774389743805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.61424112319946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.1743965148926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.7612724304199, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.32080006599426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.9553575515747, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.55631923675537, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2961602210999, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 421.61983847618103, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 408.2192015647888, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6585645675659, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.4771203994751, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.9540753364563, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 397.08863854408264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.99551916122437, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.04512166976934, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 414.6875214576721, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 420.44528007507324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.5196838378906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.514726638794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 347.43135929107666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 339.58848118782043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 343.0415999889374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.9095993041992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.9635200500488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.93136048316956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.6720037460327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.2540822029114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.1446385383606, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.32384276390076, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 342.43264079093933, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.9241580963135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.94656109809875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.6916787624359, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.65856170654297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.6825602054596, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.0950403213501, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 341.18223786354065, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.48560094833374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.95504117012024, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.0113613605499, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.9260823726654, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.3894410133362, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.22528100013733, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.30623841285706, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 341.11727833747864, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 
8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 342.6046419143677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.47183990478516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.12752079963684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.4619174003601, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.64095997810364, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.90176010131836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.80735778808594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 344.86608266830444, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.4193594455719, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.2404816150665, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 349.7056007385254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.04111981391907, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.0017580986023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.52351927757263, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.01024055480957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 349.2580807209015, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.84815788269043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.32255959510803, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.61375856399536, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.4623987674713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.37487840652466, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.48256158828735, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.93232226371765, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.7238392829895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.56447982788086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.5804777145386, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.865761756897, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.7179214954376, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.51279640197754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.67648005485535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.8604781627655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.3808009624481, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.20687770843506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.6556794643402, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.6340775489807, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.7652778625488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.4944031238556, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.5984010696411, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 395.55359721183777, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.8644742965698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.25072288513184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 408.88320446014404, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.0764811038971, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.0502419471741, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.0169606208801, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 408.14671874046326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 397.8484785556793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.98735666275024, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.6444787979126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 413.2750380039215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.47216176986694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.9644775390625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.2780821323395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 412.89552330970764, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.72896122932434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.0860800743103, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.5313606262207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.76383924484253, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 342.6635229587555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.0982406139374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.207679271698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.72832131385803, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 343.8540816307068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.13647985458374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, 
"num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.7230398654938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.81967878341675, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.9747188091278, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.5580792427063, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.17903995513916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.0478403568268, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.17711877822876, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.62160062789917, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.1153597831726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.8246397972107, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.45887994766235, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.90367913246155, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.9979181289673, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.45792174339294, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.8243193626404, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.0964787006378, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.67455887794495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.9927999973297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 341.6254389286041, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.0527992248535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.40320205688477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.9851191043854, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.60816383361816, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.03568053245544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.6684787273407, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 373.78528237342834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.65312099456787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.7436797618866, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.4284813404083, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 379.1543996334076, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.04896235466003, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.99823808670044, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.2345595359802, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.42816138267517, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.43200159072876, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.5271985530853, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.38592052459717, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.9892797470093, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.82623839378357, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.7487995624542, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.9412808418274, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.19039821624756, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.2830390930176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.2203199863434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.511198759079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.1075220108032, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.1379237174988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 417.2507178783417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.6563243865967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 416.0483229160309, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.801598072052, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, 
"num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 419.39695596694946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.7751998901367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 416.7587184906006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.5607979297638, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.439838886261, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.6131205558777, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.41167879104614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.688481092453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.7660789489746, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.9688003063202, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.31936287879944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.59983706474304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.0332806110382, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.2094385623932, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.1113615036011, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.68111872673035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.59967947006226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.39983916282654, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.1115207672119, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.2902412414551, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.9166407585144, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.65312099456787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.84272170066833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.77775979042053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.8860788345337, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 364.5297598838806, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.07247829437256, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 423.2803225517273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.4257607460022, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 422.7028822898865, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 411.20911955833435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 424.1646361351013, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 403.1371212005615, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 433.1009578704834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 397.014080286026, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.9987196922302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.6006398200989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.38112235069275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.65184020996094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 418.2320022583008, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.1993598937988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 384.6611201763153, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.2161555290222, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 384.8483204841614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.9476807117462, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.3263998031616, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.547518491745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.70048213005066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.4912037849426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 384.42352056503296, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.0630407333374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, 
"num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.2523195743561, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.7355201244354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.9225609302521, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.8668808937073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.93343901634216, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.28959941864014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 388.19488048553467, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 388.08544278144836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.1404821872711, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.07983922958374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.0779182910919, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.40111804008484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 381.7400002479553, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.19904088974, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.0273609161377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.0462417602539, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.73519921302795, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.0271999835968, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.5371198654175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.8923192024231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.2135977745056, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.08367919921875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 383.34271788597107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.241916179657, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.4007980823517, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.72255992889404, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.2268810272217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.80208444595337, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 402.5428819656372, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.8991997241974, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.215039730072, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.2887969017029, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.5060818195343, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.52143955230713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.6553599834442, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.00783801078796, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 400.23695826530457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.11504340171814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 390.5068814754486, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.24592113494873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.6883237361908, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.9115207195282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.4185588359833, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.7844805717468, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 396.15007758140564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.24959993362427, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.43903732299805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.41440057754517, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.4249610900879, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.7996826171875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.9969639778137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.81311559677124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7753615379333, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.9249625205994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.10383892059326, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.3755221366882, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.4019150733948, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.260968208313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.7377586364746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.5563173294067, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.3742370605469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.6419191360474, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.2289552688599, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.3734402656555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 
32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.0355205535889, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.8134393692017, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.6851134300232, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.3137617111206, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.0582404136658, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.7771203517914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.0260806083679, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.2487988471985, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 379.76768016815186, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.4931173324585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.58335995674133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.679678440094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.4758417606354, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.73792028427124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.57776141166687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.63135838508606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.1825575828552, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.11215901374817, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.2548773288727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.073118686676, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 394.5423996448517, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.81087851524353, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.5443186759949, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.0348825454712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 394.0619206428528, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
377.4127984046936, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.879998922348, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.7588815689087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 400.3596806526184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.56176352500916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.20928263664246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.5992012023926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.40607810020447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.5083222389221, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.50928235054016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.85007905960083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.11424136161804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 383.7977600097656, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.62751722335815, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.4003210067749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.8984007835388, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.5193591117859, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.09023809432983, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 402.92192339897156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 384.5527982711792, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.27455735206604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 381.62495970726013, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.2089583873749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.3169593811035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.44816279411316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.5252802371979, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} 
-{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.53264117240906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.8889584541321, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.59439992904663, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.7447998523712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.9720034599304, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.84640192985535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.8728015422821, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.87167477607727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 397.1996808052063, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.19983768463135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.58976125717163, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.97712087631226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.99295687675476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.16992020606995, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.27711820602417, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 388.1816029548645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 403.5488021373749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 381.84383749961853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.83935832977295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.4552011489868, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.722718000412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.60400581359863, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.3881583213806, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.2761559486389, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.42463731765747, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
451.9806408882141, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.4912042617798, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.49087476730347, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.0980851650238, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.39424085617065, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.8846406936646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.7939205169678, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.7867183685303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.02688121795654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.5734438896179, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.86848306655884, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.5278356075287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.7054388523102, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.00784373283386, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.3088004589081, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.62479877471924, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.85263562202454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 379.19536113739014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.7564797401428, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.08079981803894, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.53103971481323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.3243179321289, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.4099187850952, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.8193590641022, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.1049563884735, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.12287974357605, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} 
-{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.2011194229126, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.7438397407532, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.313600063324, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.6476786136627, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.011682510376, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.8743999004364, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 403.64431858062744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.81647849082947, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.33088088035583, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.92848014831543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.68943977355957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.0681571960449, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.2251205444336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.4359998703003, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 410.67471742630005, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.6622385978699, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.65808057785034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.219521522522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.8302412033081, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.39375948905945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 418.1006383895874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.7520024776459, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 396.27135276794434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.8844804763794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 425.2235162258148, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
375.9435176849365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.2632019519806, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.34111857414246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 427.8235173225403, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.71247720718384, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 400.7270383834839, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 391.0100769996643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 428.24560165405273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.6801574230194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.2201633453369, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.756959438324, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 785.9078407287598, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.2998352050781, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 773.4543943405151, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.35072088241577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 782.4633526802063, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.55023884773254, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.28512144088745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 401.4401614665985, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 391.83664202690125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 434.3164849281311, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.30783796310425, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.61056089401245, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.44704246520996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.1374409198761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.653920173645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.7168028354645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.89503931999207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.4851200580597, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.1511986255646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 402.015997171402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.6796820163727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.1756854057312, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.6379179954529, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.1532790660858, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.89264130592346, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.28080463409424, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 391.7915213108063, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.0246386528015, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 383.45679998397827, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.09855461120605, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.7404832839966, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 427.10432052612305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.0264048576355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 423.5956811904907, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.1567983627319, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 416.99999809265137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.6719994544983, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 422.7384042739868, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 413.5652816295624, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.6907217502594, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 370.21552205085754, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.27584409713745, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 431.3356876373291, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.71440410614014, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.39040207862854, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.7235198020935, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 423.8988780975342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.2407991886139, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.12719988822937, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.5287938117981, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.84703731536865, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 384.74496126174927, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 409.3963158130646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.32032203674316, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 418.19632291793823, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.04255747795105, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.824960231781, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.679847240448, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 434.15071964263916, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 388.77920031547546, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.0116775035858, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.348002910614, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 413.7985599040985, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.18127727508545, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.6857604980469, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.0094413757324, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.58800172805786, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.2455987930298, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.30256032943726, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.34704065322876, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.5243215560913, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.8396773338318, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.1598482131958, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.4027171134949, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.1940841674805, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.6007943153381, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.8694396018982, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.1163239479065, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.8313660621643, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, 
"num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.6585597991943, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.717116355896, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.5123205184937, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.64112520217896, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.83039999008184, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.3241629600525, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.3905620574951, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.8428783416748, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.1305575370789, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.13823795318604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.60111999511713, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.55247831344604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.7777619361877, "config": {"BLOCK_SIZE_M": 
64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.4174389839172, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.3292832374573, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 698.331356048584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.359842300415, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 681.4406394958496, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.181435585022, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 686.800799369812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.3745579719543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.4708819389343, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 698.721284866333, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.24911761283875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.1425585746765, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 383.341760635376, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.20928168296814, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.83712220191956, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.58672165870667, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.88511657714844, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 396.49280071258545, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.1388795375824, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.2590401172638, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.95647978782654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 379.3675231933594, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.71935749053955, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.8390402793884, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.14048051834106, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 396.55919790267944, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.6742398738861, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.6772794723511, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.2427203655243, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.82736015319824, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.0296006202698, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.3807978630066, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.18912267684937, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.3033585548401, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.3095989227295, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.48624062538147, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.5376012325287, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.80928087234497, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.2969596385956, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.59616136550903, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 394.25456166267395, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.13376331329346, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.06367921829224, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 416.65743827819824, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.2703976631165, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 410.9447991847992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.2446389198303, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 421.3369596004486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.409761428833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 413.63168001174927, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.2580828666687, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 415.29184341430664, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.8752021789551, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 414.4822382926941, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.02303981781006, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 421.29440784454346, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.1267223358154, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 412.3483216762543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.3872013092041, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.96912026405334, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.2875213623047, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 407.02927470207214, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.87807965278625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.50127840042114, "config": {"BLOCK_SIZE_M": 
64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 394.9934387207031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.57168340682983, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.295681476593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.59535121917725, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.4092798233032, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.00784134864807, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 396.4246428012848, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.40368127822876, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.07056045532227, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 408.6971187591553, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.86463952064514, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.40416073799133, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 395.06096363067627, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.21984028816223, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.0948803424835, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 406.5184020996094, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.0825581550598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.2387239933014, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.95408487319946, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 424.2969584465027, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 431.90751791000366, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 426.376314163208, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 429.4601607322693, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 427.5508785247803, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 433.93439769744873, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 423.7883222103119, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.1495933532715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 434.93279933929443, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.0990381240845, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 434.5747184753418, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.3980793952942, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 436.6902446746826, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.1707158088684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.9801559448242, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 680.3433632850647, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.4752011299133, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.4640030860901, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.419683933258, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, 
"num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 685.7265591621399, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.6284809112549, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.9652786254883, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.1249613761902, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 682.6464033126831, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.178081035614, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.9079976081848, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.0596823692322, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.9752039909363, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 637.0489621162415, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.6879992485046, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.2524814605713, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.7729578018188, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.2892823219299, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.4974374771118, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.0148782730103, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 676.7872071266174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.55504322052, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.8300805091858, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.6980786323547, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.3321633338928, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 650.3438329696655, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.3699131011963, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.726719379425, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.2940802574158, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
634.5793724060059, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.8817591667175, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.19952917099, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 816.7271971702576, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 804.858386516571, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.522566318512, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 784.1417622566223, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 815.1971197128296, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 789.3760061264038, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 820.2936053276062, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 798.3564829826355, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.9671995639801, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.3204846382141, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 425.29152154922485, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.5383915901184, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.8899235725403, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 410.2552008628845, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2796783447265, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.8385579586029, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 436.46512031555176, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.05856132507324, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.22304010391235, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 417.2472023963928, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.56783390045166, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.4348797798157, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 428.0430340766907, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} 
-{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.62896156311035, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.68944025039673, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 413.3393609523773, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.1817612648011, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.4132823944092, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 425.84911823272705, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.95551919937134, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.0324811935425, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 408.69728088378906, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.13840675354, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.4559993743896, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.6900806427002, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.984959602356, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.1736001968384, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.9135999679565, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.827356338501, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.0371255874634, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.9036831855774, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.5608024597168, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.86783313751215, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 400.36367893218994, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.0955200195312, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.97567653656006, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.4403190612793, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.0486397743225, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
560.8708834648132, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.81536054611206, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.8788771629333, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 401.7859184741974, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.4353585243225, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.0551996231079, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3358335494995, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 403.38640093803406, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2157.370252609253, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.2390398979187, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2155.1417446136475, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.4395213127136, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2131.9035243988037, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.1879997253418, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2130.774555206299, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.282399892807, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1347.037591934204, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.356481552124, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1332.7947187423706, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.4536037445068, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1370.317120552063, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 958.6412858963013, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1347.9142379760742, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 980.7564830780029, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1312.3680114746094, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.7958307266235, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 
4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1353.0207967758179, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 959.7055912017822, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1315.1753616333008, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 922.0108890533447, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1355.9905529022217, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 957.0036745071411, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1369.8750305175781, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 839.3732786178589, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1363.5646343231201, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 843.1299209594727, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1358.3686447143555, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 854.3976020812988, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1360.46639919281, "config": {"BLOCK_SIZE_M": 256, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.8870277404785, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5400.282554626465, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.2171168327332, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5521.018867492676, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 712.3126459121704, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5568.6542320251465, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.9088039398193, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5514.854431152344, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.6579208374023, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.400643825531, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.9092836380005, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.1172823905945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 472.7408003807068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.987361907959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.8251152038574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.79264307022095, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.63056087493896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.01664304733276, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.4689564704895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.8795237541199, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.2563228607178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.0180835723877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.7279987335205, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.87760066986084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.448956489563, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.9009623527527, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.80447816848755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.23360204696655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.8470411300659, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8371243476868, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.03808307647705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.84784173965454, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.77503633499146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.5131254196167, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.0177607536316, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.0947184562683, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.44032096862793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.7126398086548, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.16592264175415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.15376329421997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.04224395751953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.3411202430725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.4599928855896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.8124785423279, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.7155222892761, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.070237159729, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.8988814353943, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.5513606071472, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6244759559631, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.0236768722534, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.5019235610962, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.53408002853394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.3302412033081, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.88192272186285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.9823932647705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.9681644439697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.3988819122314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.43344354629517, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.79311895370483, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.5998339653015, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.5876798629761, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.80431890487677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.8788814544678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 495.5124807357788, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.3894371986389, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.73055839538574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.4707221984863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.3148784637451, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.62432050704956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.1174383163453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.94544076919556, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.05744314193726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.1419253349304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.6916847229004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.0177612304688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.5499229431152, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.2756838798523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.9065656661987, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.7987236976624, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.3843207359314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.281277179718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.3382396697998, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.5774435997009, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.9612793922424, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.6553583145142, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.6577596664429, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.1633577346802, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.129280090332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 723.0787229537964, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.3304018974304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.0911989212036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.7529635429382, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.34832239151, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.1279997825623, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.7484831809998, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.6841607093811, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 721.8694376945496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.0521626472473, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.4187164306641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 683.6155223846436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.5699162483215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.3118405342102, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.4272003173828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.6577596664429, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 720.1807975769043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.03855419158936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.89216136932373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 461.84208631515503, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.059841632843, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.6257586479187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.1444821357727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.69184160232544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.1319971084595, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.5099182128906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 470.04576206207275, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.6878423690796, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.067675113678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.4124798774719, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.1865644454956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.3537669181824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.47711992263794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.89551877975464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.04719400405884, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.06415700912476, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.33039712905884, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.64367628097534, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.8473539352417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.3446407318115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.8161563873291, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.1214408874511, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.36927366256714, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.65040016174316, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.42336320877075, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.2849626541138, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.8023986816406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.82015228271484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.87744092941284, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.6052794456482, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.22704553604126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.3409605026245, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.8120031356812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.60399770736694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.67215728759766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.9139189720154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.48047971725464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.4555196762085, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.9766402244568, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.34255933761597, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.4798374176025, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.7265558242798, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.6156826019287, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.45248222351074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.78463840484625, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.96255922317505, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.70303201675415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.1379227638244, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.8209643363952, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.6668839454651, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.48415660858154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.90943670272827, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.9489541053772, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.59663677215576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.06143522262573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.68864011764526, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.21823501586914, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.38223695755005, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.2768020629883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.8963165283203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.07456398010254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.8452806472778, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.0979194641113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.28928136825556, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.133120059967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.8363146781921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.1732873916626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9248013496399, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.8006434440613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.1726393699646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} 
-{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.2156763076782, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.7091193199158, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.4611191749573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.8033547401428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.7782368659973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.08592033386236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.2017631530762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.63040351867676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.1135993003845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.91632080078125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.9867219924927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.8675165176392, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.7299189567566, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.1996831893921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.69919776916504, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.27152395248413, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.8038401603699, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.63552141189575, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.6734414100647, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.7776007652283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.88143730163574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.33904218673706, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.5924777984619, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.23855781555176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.999520778656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 474.98703956604004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.47151803970337, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.3892774581909, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.0939211845398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.9155201911926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.56912183761597, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.78975439071655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.471200466156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.0088028907776, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.9056005477905, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.3862357139587, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.6009564399719, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.36463832855225, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.01903533935547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.49152135849, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.8369626998901, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.1068820953369, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.40688276290894, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.7883176803589, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.02847623825073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.8704047203064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.90239763259893, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.94143772125244, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.30656147003174, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.0460777282715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.05200147628784, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.87312173843384, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.0803198814392, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.1115174293518, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.60016107559204, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.1436758041382, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.8487935066223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.4116792678833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.3166465759278, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.97376585006714, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.86064529418945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.5206422805786, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.1236791610718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
847.0975923538208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.7028818130493, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 853.7065601348877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.457437992096, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 854.2985725402832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.5515203475952, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 862.2068881988525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.4817633628845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.8278388977051, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.0052766799927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.29439878463745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.99135637283325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.1006426811218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.05664682388306, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.82144021987915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.1884789466858, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.77232217788696, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.1883158683777, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.51343965530396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.58128023147583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.4750409126282, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.49439811706543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.6110324859619, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.5860815048218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.14816093444824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.30335521698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.56463956832886, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.43007802963257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.52992391586304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.34608030319214, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.78256273269653, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.4327988624573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.7833614349365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.467041015625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.3667178153992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.6975932121277, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.4921598434448, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.59343671798706, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.8545589447021, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.39695978164673, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.0692811012268, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.6680030822754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.31728315353394, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.3214421272278, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.8332777023315, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.97855615615845, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.2771215438843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.5403218269348, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.5910358428955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.42703914642334, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.22720193862915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 484.9544024467468, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.580002784729, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.2416038513183, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.1864042282105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.44112205505377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.9291248321533, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.1135983467102, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.61759901046753, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.0438389778137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.7072019577026, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.9804797172547, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.4688024520874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.5473628044128, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.5652766227722, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.69215965270996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.3788814544678, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.264639377594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.4707236289978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.0300760269165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.7937636375427, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4462366104126, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.4332814216614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.5544023513794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.88447237014765, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.3735918998718, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.3686370849609, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.4799971580506, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.4604802131653, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.2886424064636, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.9265565872192, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.84047842025757, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.41616058349615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.1624002456665, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.0084772109985, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.6414394378662, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.529757976532, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.2883205413818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.0571188926697, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.7320022583008, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.75535821914673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.4038400650024, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.0808029174805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.3692841529847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.6886386871338, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.9918508529663, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.3352003097534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.9107098579407, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.41984033584595, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.1582398414612, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.6043210029602, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.99727630615234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7839941978455, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.3747200965881, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.9831981658936, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 783.0198454856873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 814.6391940116882, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.4806361198425, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 712.4558424949646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.9104022979736, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.9678425788879, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 780.861759185791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 806.8563151359558, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.1468782424927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.4620785713196, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} 
-{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.3959937095642, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.197114944458, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 787.6835203170776, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 806.4433646202087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.4460830688477, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 712.0347213745117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.1579179763794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.5918412208557, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 779.4144034385681, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.6856060028076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.6097555160522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.811520576477, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.6844806671143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.62672185897827, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.2084822654724, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.14895725250244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.05984258651733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.92447328567505, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.32592010498047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.6969618797302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.43584299087524, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.91135931015015, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.64559507369995, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.299204826355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.5310368537903, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
486.66847944259644, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.0955286026001, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.6545567512512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.204158782959, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.90400314331055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.0683250427246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.7543969154358, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.7822437286377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.02527952194214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.6473593711853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.94944429397583, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.0964789390564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.2278437614441, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.7108874320984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.2971200942993, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.6843204498291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.5854392051697, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.7180790901184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.2276773452759, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.8510413169861, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2492828369141, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.9841604232788, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.8430423736572, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.4569606781006, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.8008027076721, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.5592007637024, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} 
-{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.9675207138062, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.9864010810852, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9407949447632, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.0883193016052, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.7487988471985, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.992636680603, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.231999874115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.40223836898804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.6191935539246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.2566418647766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.6510338783264, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.30831384658813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.7609601020813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.3787207603455, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.8862390518188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.03696393966675, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.1632013320923, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.3855967521667, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.2307195663452, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.70000314712524, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.160481929779, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.5875129699707, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.5564832687378, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.75312185287476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.0271978378296, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 533.7043237686157, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.6507205963135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 731.5787196159363, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.5057654380798, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.6606373786926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.7057681083679, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.0148854255676, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.7852735519409, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.0175986289978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.038402557373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.7651171684265, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.5705647468567, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.8459167480469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.449761390686, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 731.1942386627197, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.9235215187073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.4508762359619, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.722243309021, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.77120304107666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.40847873687744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.56368112564087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.2615976333618, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.3188810348511, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.61120414733887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.4851245880127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.5836796760559, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.206392288208, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.9520010948181, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.75119638442993, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.2505569458008, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.92240715026855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.32512187957764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.55615615844727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.8715214729309, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.6545624732971, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.07456731796265, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.14319849014277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.6220769882202, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.56127405166626, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.78272008895874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.9639992713928, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.707200050354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.2280068397522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.41199588775635, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.3900828361511, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.04543685913086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.68640184402466, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.48640060424805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.52000093460083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.10048484802246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.7534456253052, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.2929553985596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0417609214782, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.243200302124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.2686376571655, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.285279750824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.08895874023443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.1867184638977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.8993611335754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1977596282959, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4747180938721, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.81679582595825, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.6769595146179, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9185547828674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9702391624451, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1036.7870473861694, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.5051140785217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1046.2390422821045, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.1932787895203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1046.6519975662231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.6742415428162, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1044.543514251709, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.4646401405334, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.842565536499, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.3147196769714, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.1494421958923, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.9870395660401, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.5614376068115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.566556930542, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.1870393753052, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.3379240036011, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.54752016067505, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3619117736816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.29087686538696, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.52160024642944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.10031938552856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.83615922927856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.50399875640875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2008056640625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 477.0345616340637, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.93583965301514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.6608033180237, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.1430411338806, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.7654371261596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.9604845046997, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.7700810432434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.4092745780945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.1110486984253, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.5150375366211, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.0920014381409, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.6753582954407, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.5708770751953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.36879730224604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.6083173751831, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.5787253379822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.9463958740234, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.89552116394043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.3484778404236, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.5164813995361, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.8527979850769, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.0395221710205, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.773115158081, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.9414367675781, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.9071979522705, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.21087598800665, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.3262405395508, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.1156787872314, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.7318396568298, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.7003231048584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.7531261444092, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.2833642959595, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.9310350418091, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.28383922576904, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.6716771125793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.9460773468018, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.1279988288879, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.6995258331299, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.0324811935425, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.5681557655334, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.7276787757874, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.55279636383057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.5289583206177, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.6558408737183, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.7048015594482, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.1683168411255, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.1206426620483, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.4311962127686, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.0340805053711, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.0395178794861, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.4912009239197, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 729.6211194992065, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.8147192001343, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.0406403541565, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.2803211212158, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.9638409614563, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.4401597976685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 735.8587193489075, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.3001618385315, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.1924767494202, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.8880033493042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.6904010772705, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.6627159118652, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 735.3420805931091, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.2684845924377, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.919517993927, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.5945582389832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.7884798049927, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.8612775802612, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 732.513120174408, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.4740858078003, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 707.9084801673889, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 909.161434173584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 929.0564870834351, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 918.5683155059814, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.5052795410156, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 920.7291269302368, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.9278326034546, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 918.1777667999268, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 937.9136037826538, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.9291195869446, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.4052758216858, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.24128055572515, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.25631999969477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.0483202934265, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.82959604263306, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7376008033752, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.1316819190979, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3908772468567, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.2968006134033, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.4148797988892, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.46095943450933, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.2516841888428, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.6539192199707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.5595216751099, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.4401602745056, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.4579219818115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.21792125701904, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.1166429519653, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1307225227356, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.6961603164673, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.3313570022583, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 537.8124785423279, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9503984451294, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.8972768783569, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.65936040878296, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.0369591712952, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.5923271179199, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.4888033866882, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.776801109314, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.0369563102722, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.0564775466919, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.1297578811646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.0991978645325, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.4423990249634, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.1862421035767, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.2233581542969, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.0364861488342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.5759973526001, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.8592066764832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.3948802947998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.9006423950195, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.9043231010437, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.0724849700928, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 637.7334475517273, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.29407787323, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.2705583572388, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.9001603126526, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.31055784225464, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.080641746521, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.94592094421387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.3630361557007, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.5889620780945, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.53664445877075, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.438401222229, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.32239818573, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.243679523468, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.7697548866272, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.72816133499146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.63711881637573, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.7100830078125, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.54943990707403, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.1915216445923, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.08943939208984, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.40735673904425, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.4511995315552, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.85279655456543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.65903949737543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.3120036125183, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.11631441116333, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.84496259689325, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.3870391845703, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.9044780731201, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.9284749031067, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.5910396575928, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.5049619674683, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.3652806282043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.4571189880371, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.682240486145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.1683211326599, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.362557888031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.2540826797486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.7601580619812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.8334436416626, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.5731191635132, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9656019210815, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 
2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.2577614784241, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.7577600479126, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 919.0977478027344, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 874.7552013397217, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 756.3060688972473, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.7388820648193, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 921.1959886550903, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 861.1860752105713, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.2121620178223, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 784.9721670150757, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 912.193922996521, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 881.9087982177734, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.1528024673462, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.8446373939514, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.306568145752, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 863.3812856674194, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.2852802276611, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.1214380264282, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 919.5694446563721, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 888.2473659515381, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.6396775245667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.0584011077881, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 919.3343925476074, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 862.6158475875854, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 769.6737670898438, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
776.6185617446899, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 916.5529584884644, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 879.6939182281494, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 761.3945579528809, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 771.7225646972656, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 926.3254356384277, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.9913539886475, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.3937511444092, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 788.101282119751, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1107.701120376587, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1057.808313369751, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1097.3603105545044, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1069.1529560089111, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1097.2671937942505, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1072.7227306365967, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1101.8201541900635, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1068.4824085235596, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.5830397605896, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.7286353111267, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.1070394515991, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.9907169342041, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.9278378486633, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.5656037330627, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.0745630264282, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.6241598129272, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.8638386726379, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.1241602897644, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.124321937561, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.6028747558594, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.0137577056885, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.2347211837769, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.9143986701965, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.2072019577026, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.6604838371277, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.4764742851257, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.7036843299866, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.5407962799072, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.6921577453613, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.0086326599121, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.0710425376892, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.3737664222717, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 861.4644765853882, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.940477848053, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 861.0625600814819, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.7523212432861, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 859.0662384033203, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 737.3638439178467, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 860.406084060669, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.2515215873718, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.1555190086365, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.5910339355469, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 684.4694375991821, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.677282333374, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 742.7108812332153, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.3068833351135, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.3060822486877, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.1395201683044, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.2188830375671, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.4937605857849, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 685.5468845367432, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.8824005126953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.9116764068604, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.9828796386719, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.3017597198486, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.6291246414185, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2439.718551635742, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.991039276123, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2439.4761657714844, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.2193632125854, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2454.920015335083, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.6356792449951, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2445.2486419677734, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.0816016197205, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1823.936471939087, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1275.231523513794, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1822.3835182189941, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1290.6222486495972, "config": {"BLOCK_SIZE_M": 256, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1830.7166481018066, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1270.856966972351, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1848.1737613677979, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1281.9995260238647, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1836.9475078582764, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1274.169602394104, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1849.0423965454102, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1275.0219249725342, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1833.1099224090576, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1269.8171138763428, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1852.1881675720215, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1290.0630378723145, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 1864.9940872192383, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.131685256958, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1870.2415943145752, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1160.5334424972534, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1865.4993438720703, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.476643562317, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1875.291519165039, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1149.147367477417, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7354.446449279785, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 950.5054426193237, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7384.832649230957, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.2988796234131, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7363.726768493652, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.7424144744873, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7391.756782531738, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 941.5540885925293, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.32016372680664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.11040592193604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.0401611328125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.5196776390076, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.8401570320129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.84799814224243, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.2270412445068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.9315228462219, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.3454418182373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.49183893203735, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.6057605743408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.7560033798217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2574367523193, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.5294451713562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.8528022766113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.5372838973999, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.74928188323975, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.9041647911072, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.9552035331726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.1254391670227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.1179265975952, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.2208008766175, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.5699214935303, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2286367416382, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.62671661376953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.0147213935852, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.85567903518677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.6723184585571, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9484839439392, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.1278438568115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.3892812728882, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.188485622406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.2982397079468, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.54959869384766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.17919969558716, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 510.74176311492926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.5380735397339, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.51984167098993, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.8553576469421, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.9267230033875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.1755218505859, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.81024217605585, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.7123236656189, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.875039100647, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2884831428528, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.2454414367675, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.1627202033997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.4183979034424, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.1575999259949, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.1905603408814, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.01471614837646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9251136779785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6238441467285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.5215997695923, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.9964814186096, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.9159989356995, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.35424184799194, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.0009579658509, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.7408013343811, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.4070358276367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.9128007888794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.6710343360901, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.1092796325684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.8747172355652, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2273578643799, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.5648007392883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.8588833808899, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.5652747154236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.8296022415161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.0372776985168, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 723.475513458252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.2584013938904, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.0452828407288, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.3939199447632, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 707.0787215232849, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 721.7812728881836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.9521579742432, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.477276802063, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.2215991020203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.292317867279, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.0625629425049, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.0491199493408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.8100819587708, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 724.018075466156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.4343996047974, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.538562297821, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.6363186836243, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.1576051712036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.7463998794556, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.1646418571472, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.2003178596497, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.731041431427, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.8990359306335, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.5300765037537, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 732.1078395843506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.646402835846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.4187211990356, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.0796785354614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.3464002609253, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} 
-{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.09152030944824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.38496589660645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.57263469696045, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.72128200531006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.10447835922247, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.5862398147583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.30191564559937, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.07008266448975, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.86767768859863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.28704071044916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.4027199745178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.2590432167053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.43952369689936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.1942372322083, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.5025601387024, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.2945647239685, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2728023529053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.3055996894836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.20784187316895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.1124801635742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.3400011062622, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.5700721740723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.28768253326416, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.0075206756592, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.4697632789612, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
513.1078481674194, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.4284815788269, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.7743968963623, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.79567813873297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.62351751327515, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.98047828674316, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.0241584777832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2131261825562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.8041577339172, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.40640163421637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.745762348175, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.9233565330505, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.83440113067627, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.34048080444336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.1209607124329, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.0296006202698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.07760620117193, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.8078370094299, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.1799969673157, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.5780844688416, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.21727657318115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.5143957138062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.6316738128662, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.9318385124207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.58543443679804, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.25919723510737, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, 
"num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2894358634949, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6953601837158, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.1419196128845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.1596798896789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.9310331344604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.1428799629211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.0884785652161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.76831674575806, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.9745635986328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.4521622657776, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.3798413276672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.4083228111267, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.2011179924011, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.2931213378906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.1755175590515, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.6528024673462, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.286075592041, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.6196784973145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.4230456352234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.793598651886, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.9734449386597, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.7540812492371, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.3628854751587, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.6414403915405, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.1048035621643, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 577.8438353538513, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.27327823638916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.58848381042475, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.82640409469604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.06848764419556, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9545564651489, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.38623762130743, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.4519987106323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.9772815704345, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.33359813690186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.16911983489985, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.24559783935547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.1257562637329, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.2673554420471, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.1748833656311, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.07024002075195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.60240077972406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.18624353408813, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.2841544151306, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.4100818634033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.8857564926147, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9220790863037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.7843179702759, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.0251159667969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1465630531311, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.9468765258789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.80992221832275, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.3422431945801, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.3614430427551, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.9355239868164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.67568159103394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.02335500717163, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.37167978286743, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.1481552124023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.0865626335144, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.0979218482971, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.99455881118774, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.10431814193726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.04416322708136, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.1798396110535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.5248045921326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2113628387451, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.9947166442871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.09519815444946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.31343841552734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.5028753280639, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.053918838501, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.8446383476257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.26303911209106, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.85887575149536, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.3568015098572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.66543912887573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.5147185325623, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.9619235992432, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.60015535354614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.1684837341309, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0785608291626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 874.3945598602295, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.772322177887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.2844886779785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.1121644973755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 874.3243217468262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.2979230880737, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 879.0352010726929, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.9257636070251, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.3798432350158, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.85663414001465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2291193008422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.6958384513855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.5795202255249, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.3785557746887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.8054389953613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.8051190376282, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.4955177307129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.29247665405273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.2675223350525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.938720703125, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.7769618034363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.69119930267334, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9878416061402, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.5391993522644, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.86832189559937, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.4361548423767, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.030080318451, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.3260769844056, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.4603247642517, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.88207721710205, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.8081603050232, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.141122341156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 539.9615979194641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.59583663940435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.9275231361389, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.5115184783936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.274405002594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.66303873062134, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.7772827148438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.26784181594854, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.76816272735596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.67615461349493, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.55504560470575, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0500841140747, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.5083198547363, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.53760051727295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.6824059486389, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.8337631225586, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.7332787513733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.7766456604004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.809280872345, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.06032085418707, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.8980784416199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.5847959518433, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.1627125740051, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.6747250556946, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.2511959075928, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.84575748443604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.9225640296936, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.25856161117554, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.6576051712036, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4409627914429, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.0369558334351, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2860794067383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.2347197532654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.94143724441534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.21647977828974, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.85231637954706, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.7920064926147, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.40848159790045, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.9547171592712, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.7339172363281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.1564769744873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.6196751594544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.6675143241882, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.7817568778992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.7488050460815, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7116847038269, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.1838459968567, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 663.4439992904663, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.2912030220032, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.82543420791626, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8752059936523, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 637.3879957199097, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.2328033447266, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.0777540206909, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.4710426330566, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.1694407463074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.8108806610107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1764750480652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.8641562461853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.2876825332642, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.4054408073425, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.2408022880554, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.1108884811401, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.8824014663696, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.7543940544128, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.4923195838928, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.731360912323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.9542369842529, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.3523230552673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.0388770103455, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.1299214363098, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.7744045257568, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.7681603431702, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.3956661224365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 841.4571237564087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.0318369865417, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.1212739944458, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 779.8830389976501, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.6016001701355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 818.8598299026489, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 839.2447996139526, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.7948808670044, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.3683214187622, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.7462377548218, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.1515188217163, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 814.9081563949585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 842.8244781494141, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.7712001800537, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 749.1935992240906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 775.7566404342651, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.9568033218384, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 823.5291147232056, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 846.8979120254517, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.6675186157227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.6992030143738, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.5499215126038, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.30191802978516, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.3907198905945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.8447952270508, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.89888191223145, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.44207763671875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.51328325271606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 493.1712055206299, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.742880821228, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.07775831222534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.4788784980774, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.0584034919739, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.96368169784546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.3220782279968, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.06127643585205, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.7980737686157, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4852824211121, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.5212788581848, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.2966365814209, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.9958391189575, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.95136404037476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.4340839385987, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.1915183067321, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.3401656150818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.1358428001404, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.3303999900818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.58319902420044, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.2697629928589, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.9320030212402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.13200187683105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.73648214340204, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.340163230896, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8992013931274, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.681282043457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.4419202804565, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.1244812011719, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.9852862358093, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.8313555717468, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.1179184913636, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.9057631492615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.0524706840515, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.4046406745911, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1921591758728, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.2057566642761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.1492824554443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.0143957138062, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.1697616577148, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.709755897522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.1302428245544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.1910357475281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.775363445282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.4731216430664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.3780832290649, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.4747142791748, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.5912051200867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.9903993606567, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.4051179885864, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.5265560150146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.5177640914917, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.7726430892944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.5479989051819, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.1587152481079, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.58608627319336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.2886366844177, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.584801197052, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.1860818862915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.9603180885315, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.6439986228943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.8236794471741, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.1798434257507, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.999837398529, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.2233567237854, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.9339208602905, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.9841628074646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.1315212249756, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.271044254303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.8460793495178, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.4313569068909, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.7401585578918, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.2086386680603, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.2854375839233, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.8731212615967, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.9388794898987, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.63968276977545, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.3755125999451, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.16239833831787, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.0152039527893, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.9955177307129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.3323221206665, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.8127965927124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.4606394767761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.35039854049677, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.2704014778137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.3905658721923, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.89888525009155, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.7204756736755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
515.2705669403076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.3353614807129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.965922832489, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.09680032730097, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2867250442505, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.04431581497187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.80560302734375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.3481631278992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.3953580856323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.5215964317322, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.0185613632202, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0947160720826, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2012805938721, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.8695993423462, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.7214369773865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.368001461029, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.8206462860107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.92592048645025, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.1172814369202, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6531195640564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.2099194526672, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.4550418853759, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.7300815582275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2699198722839, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2220783233643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.82479381561274, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.8185601234436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.5329594612122, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.9403228759766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.7735958099365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.0487985610962, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2185597419739, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.3185596466064, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1062.3534297943115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.0342388153076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1062.4356842041016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.0468873977661, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1064.6155261993408, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
614.8247957229614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1069.221749305725, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.9679985046387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.5446405410766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.2796778678894, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.528639793396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.5260772705078, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.5854444503784, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.81903886795044, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.77504158020014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.271999835968, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.0537638664246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.637282371521, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.4011220932007, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.0612750053406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.4062356948853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.1631984710693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.0724830627441, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.7675175666809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.7327995300293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.287043094635, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.4600005149841, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.4291191101074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.7305564880371, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.2435231208801, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.0336012840271, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.5094399452209, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.7393579483032, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.4918365478516, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.7483253479004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2278351783752, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.4897546768188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.170557975769, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.619366645813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.4067230224609, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.6619181632996, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.1164793968201, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.966881275177, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.1822395324707, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.7065539360046, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.0073504447937, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.9081635475159, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.2500791549683, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.5897583961487, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.0678377151489, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.0827221870422, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.2992005348206, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.1617631912231, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.2819166183472, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.3295965194702, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 681.1264038085938, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 574.069926738739, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7318429946899, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8723258972168, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.9729585647583, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.588321685791, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.7486357688904, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.0950417518616, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.8510394096375, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.8887991905212, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.6012873649597, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.9747171401978, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.6312017440796, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.4865670204163, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.2233624458313, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.2689609527588, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.3409585952759, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.3220806121826, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.7095928192139, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.0620818138123, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.854875087738, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.9275240898132, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 732.8964757919312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.2609572410583, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.1161527633667, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.1668815612793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.7356848716736, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.5388803482056, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 737.3942399024963, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.5537557601929, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 707.7444744110107, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.5902457237244, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.7891201972961, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 748.4006404876709, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 733.0875182151794, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.783362865448, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 706.7100787162781, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.2998385429382, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.5044736862183, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.2467193603516, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.5801577568054, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 961.1363124847412, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 976.3267183303833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.2817678451538, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 979.0727949142456, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 963.3452892303467, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 977.4545621871948, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 963.6254358291626, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 979.1567897796631, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.0110397338867, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.50240087509155, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.357916355133, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 517.6998448371887, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.6371216773987, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.5580787658691, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.1281642913818, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.666081905365, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.3694443702698, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.26863908767706, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.442883014679, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.0769553184509, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.617434501648, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.4894433021545, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.6171169281006, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.7303972244263, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7929615974426, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.7809543609619, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.3731241226196, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.0220808982849, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.2668871879578, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.3467154502869, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.1715245246887, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.7403240203857, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7563166618347, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.7993555068969, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.1385631561279, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.3073592185974, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.0360064506531, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 
8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.1366353034973, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.89808177948, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.785126209259, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.6876773834229, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.4948830604553, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 722.4427199363708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.443835735321, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.7939209938049, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.6385588645935, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.3308773040771, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.5303983688354, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.4428768157959, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.7195200920105, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 727.3657631874084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.9657597541809, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.6217575073242, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.1848068237305, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 727.3036789894104, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.1179213523865, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.55231761932373, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.3280005455017, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.1881637573242, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.5577549934387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.58288097381586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.5655951499939, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 499.5694375038147, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7092761993408, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.1460728645325, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.7044768333435, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4916839599609, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.3792014122009, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.02255582809454, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.0334372520447, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.7384028434754, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.88031673431396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.9892807006836, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.2201590538025, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.9388790130615, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.812961101532, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.6601629257203, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.0335946083069, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.3079957962036, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.905601978302, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.0860795974731, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.2968029975891, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.0737609863281, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.2716827392578, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.4595184326172, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.4486441612244, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.6582417488098, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.4278411865234, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.4052748680115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.5606355667114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.9928016662598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.3808054924011, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.4820823669434, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.5819201469421, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.3048024177551, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.6622395515442, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.4428758621216, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 921.077766418457, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 799.976315498352, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.2801599502563, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.2292804718018, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 899.2993545532227, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 790.6057572364807, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.8323135375977, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.9473638534546, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 916.7160034179688, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.7960052490234, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 813.1622362136841, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 938.4372854232788, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 905.6481552124023, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.6884803771973, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.5590372085571, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.9599952697754, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 921.845760345459, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 792.2380781173706, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 806.2897539138794, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 934.5491170883179, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 911.2388849258423, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 802.2751998901367, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 821.4224004745483, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 933.939037322998, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.9467124938965, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 804.8980808258057, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.1740884780884, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.5601577758789, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 905.476803779602, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.7479944229126, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 821.7107224464417, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1155.146722793579, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1113.1217670440674, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1161.7456102371216, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1131.181116104126, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1160.6601572036743, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1122.8836727142334, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.5030460357666, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1122.039680480957, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.939359664917, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.340479850769, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.647843837738, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.4166374206543, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.0840015411377, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.9683218002319, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.1275210380554, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.7796859741211, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.9814434051514, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.3476805686951, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.1798391342163, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.1336011886597, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.9857606887817, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.5377550125122, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.1996831893921, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.6595230102539, 
"config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.9598388671875, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.800802230835, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.5363230705261, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.1683177947998, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.794882774353, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.5908823013306, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.1895971298218, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.7640008926392, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 878.3763217926025, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.5832018852234, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 878.9993667602539, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.490394115448, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 871.176962852478, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.7972755432129, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 881.5416049957275, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.1699228286743, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.9774374961853, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 605.4067182540894, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 699.2614388465881, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.9470381736755, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 766.6521668434143, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.412965297699, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 697.3172760009766, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.633599281311, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.1723170280457, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.7616000175476, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 697.5601625442505, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.8230409622192, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 773.2115173339844, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 605.4091215133667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 699.9127984046936, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.9006400108337, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2502.188491821289, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.8932809829712, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2512.182397842407, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.8836770057678, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2522.870569229126, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.6814341545105, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2532.1879863739014, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.7032008171082, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1920.2425479888916, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1325.732479095459, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1939.4275283813477, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1349.6481609344482, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1926.4311695098877, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1330.7275199890137, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1933.9334392547607, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1356.2022304534912, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1929.1007709503174, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1340.8911895751953, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1943.335371017456, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1348.0604839324951, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1934.2028903961182, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1340.9367942810059, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1940.1208019256592, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1357.8107213974, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1880.2817630767822, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1173.8462400436401, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1879.7217464447021, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1180.1729536056519, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1884.256992340088, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1178.47008228302, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1885.5079936981201, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1177.8739166259766, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7463.56143951416, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 968.3332777023315, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7466.946144104004, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 971.491208076477, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7471.216354370117, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.8998441696167, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7484.053115844727, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 973.820629119873, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.0603189468383, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.5432019233704, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7871990203857, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.6755204200745, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.4220843315125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.0828757286072, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.5691194534302, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.6315202713013, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.6953611373901, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3081579208374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.6657557487488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.8080010414124, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.2745566368103, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.2753643989563, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.5737566947937, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.0305614471436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 502.3521590232849, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.1528029441833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.3487992286682, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.6441602706909, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.7936010360718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.6875176429749, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.4356846809387, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.2316741943359, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.8732786178589, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.5558385848999, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.8894367218018, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.37440204620367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.27423620224, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.6806406974792, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.393274307251, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.640962600708, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.5964832305908, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1604804992676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.3163232803345, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.0208044052124, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.7168035507202, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.5582423210144, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.8864040374756, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 605.0966382026672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.8177614212036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1848006248474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 
4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7328038215637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.9067215919495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.9183993339539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.1769638061523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.9708814620972, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.9177570343018, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.1201586723327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.42671489715576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.002076625824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.7334394454956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.0566415786743, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.1297564506531, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.8737630844116, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.774561882019, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.2361621856689, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.37807846069336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.5304002761841, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.5033626556396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.346079826355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.6851277351379, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.3148803710938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.1604771614075, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.3432030677795, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.5134401321411, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 798.462085723877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 818.9644742012024, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.9464030265808, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.3315148353577, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 846.0407876968384, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.7017545700073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.519838809967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.0739245414734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.843514919281, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 821.9647979736328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.651035785675, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.9764790534973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.2883253097534, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 869.1459131240845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.9611196517944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.6940813064575, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.0358452796936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 818.5174417495728, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.4014358520508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.1568007469177, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 840.9540748596191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 864.4908666610718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.457597732544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.672164440155, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 799.4681644439697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 812.3716855049133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 651.0638403892517, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.0976028442383, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 849.3897533416748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 863.1347179412842, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.1681609153748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.0161519050598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.26015901565546, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.41263484954834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.2406373023987, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.15791511535645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.2884788513184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.38400077819824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.9025583267212, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.8307199478149, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.51039934158325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.6041617393494, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.0499234199524, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.53055620193476, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.7684774398804, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.0728068351745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.6985578536987, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1443204879761, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.5921635627747, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.7665586471558, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.1644740104675, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.7950429916381, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 509.9515175819397, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.8393659591675, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.1383996009827, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.1119995117188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.6364817619323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.8902368545533, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.31135797500616, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.18607902526855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.68384075164795, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.67280292510986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.08863830566406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.1692805290222, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.8734407424927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.3118405342102, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.4140782356262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.8049626350403, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.9657578468323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.3254432678223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.73616027832037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.7931156158447, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.1039929389954, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.989116191864, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.85328197479254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4505615234375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.7296023368835, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.8580803871155, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.11791610717773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1356792449951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.4089622497559, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.618236541748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.25583887100225, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.5779175758361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.7707147598267, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.7715172767639, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.07519769668573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.34672498703003, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.9995188713074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.6859097480774, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.08863830566406, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.1902441978454, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.9132761955261, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.1409616470337, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.4143967628479, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.8926396369934, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.3665609359741, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.7492804527283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.8492832183838, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.8615989685059, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.4248023033142, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.3923192024231, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.2492814064026, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 695.6583905220032, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.0414423942566, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.7633609771729, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.3524785041809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.2523250579834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.0996813774109, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.5651183128357, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.48128175735474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2774386405945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0020785331726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.1777606010437, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.571834564209, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.6465578079224, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.7819232940673, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.70928049087524, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.5284829139709, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.9577641487122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.9107189178467, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.6623873710632, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.5209593772888, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.2940773963928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.650399684906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.7259216308594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.1737585067749, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.3169584274292, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.0273613929749, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.1987223625183, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8983969688416, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2604737281799, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.61439895629877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.5743975639343, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.98480129241943, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.410722732544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.1321592330933, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.7683200836182, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.7859172821045, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2052798271179, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.9408025741578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
511.12415552139277, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.6259245872498, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.5412817001343, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.5956816673279, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.821605682373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.3931188583374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.1377568244934, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2900881767273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.1286382675171, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.0137605667114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.1913628578186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.2851228713989, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.5982403755188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6113533973694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.2795233726501, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.7131214141846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.86400222778315, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.9575972557068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.0246353149414, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.5750379562378, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.9230403900146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.5692782402039, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.7980813980103, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.8292784690857, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.8326387405396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 911.382246017456, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.5283236503601, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 916.3305568695068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.5051231384277, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 911.9937562942505, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.3777542114258, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 917.0539140701294, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.0640029907227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.8227243423462, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3945631980896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.1113624572754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.2366423606872, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.46063899993896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
507.7503967285156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4950385093689, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.5180807113647, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.6387186050415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.33759784698486, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.8115224838257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.2659134864807, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.0312042236328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.320963382721, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.0151944160461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.80751609802246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.02127790451044, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.5423974990845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.9673624038696, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.2340755462646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.0611200332642, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.78319644927984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.26656055450434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.9806394577027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.4031987190247, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.402717590332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.9006357192993, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.5092749595642, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.2947187423706, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6726412773132, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.2830410003662, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.7892818450928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.29776811599737, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.2628793716431, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.9375996589661, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7718396186829, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.5196785926819, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.3888006210327, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2310423851013, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.896800994873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9084792137146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.3180751800537, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.5859222412109, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.4838376045227, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.6188836097717, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.336323261261, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.0769605636597, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.9233565330505, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.6889595985412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8731160163879, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.733606338501, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.0943970680237, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.3582458496094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.047679901123, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.0582389831543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.2377610206604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 505.9881567955017, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.3598389625549, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.9878396987915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.3865571022034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.5782384872437, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2952008247375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.5272002220154, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.2681570053101, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.0187201499939, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.5612816810608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.3899164199829, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.1167988777161, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.3593559265137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.8574390411377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.5643219947815, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 681.8384051322937, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.9324798583984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.8036775588989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.9542379379272, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.4033517837524, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.6160001754761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.5244822502136, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.4601588249207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.5915231704712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.3617601394653, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2588787078857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, 
"num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.1147179603577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.4596800804138, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.3079977035522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.7511978149414, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.2209610939026, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.9425644874573, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.1087989807129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.9316835403442, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.9375991821289, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 663.3572793006897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.4929623603821, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.7884798049927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.6187205314636, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.3569579124451, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.7187194824219, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 841.4267301559448, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 856.6420888900757, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.2080006599426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 767.4051213264465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.3054423332214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.6800031661987, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 842.3431968688965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 855.0816011428833, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.937593460083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 766.4742398262024, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 795.1915144920349, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.3919949531555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 846.965274810791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 867.673282623291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.7649598121643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 764.3028783798218, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 797.3545622825623, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.5052742958069, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 848.290228843689, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 857.3958396911621, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.0340809822083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 766.4139151573181, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 795.2583932876587, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.8270406723022, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.01232099533075, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.79759502410894, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.6967992782593, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.68416070938105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.111361503601, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.3140811920166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.7512021064758, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.4587211608886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.7281560897827, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.1086401939392, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.0422358512878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.16959285736084, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.19583940505987, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.6660771369934, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.3158383369446, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.3355212211609, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.99631547927856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.6396827697754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.3582434654236, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.6889629364013, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.9361605644226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.14399766922, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.9347257614136, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9889621734619, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.6281623840332, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.8576021194458, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.6638445854187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.976957321167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.1071991920472, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.9208083152771, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.4289622306824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.3260803222656, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.1131181716919, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.492965221405, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.710563659668, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.795521736145, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.7111916542053, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 547.9142355918884, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.6327929496765, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.7796816825867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.4331192970276, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.4910373687744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.3044838905334, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.5264058113098, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.513918876648, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.952793598175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.9996786117554, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.3222427368164, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9145617485046, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.5529561042786, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.8420767784119, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.1195178031921, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.7169585227966, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.4046363830566, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.990243434906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.8153614997864, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.3097653388977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.0239996910095, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.7958421707153, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.9412751197815, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2214398384094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.3345670700073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.67520570755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.9497632980347, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 760.5332803726196, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.3624029159546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.7915177345276, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.4875197410583, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.448320388794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.3750386238098, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.7713632583618, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.5176048278809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.4043173789978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.0036773681641, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.0135998725891, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.2551989555359, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.0670385360718, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.7351975440979, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.5424003601074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.7137637138367, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.2350430488587, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.312798500061, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.0489640235901, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.817120552063, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.7508826255799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.2951946258545, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.4185585975647, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.182719707489, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 503.45024108886713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.4958367347717, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.097442150116, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.6062393188477, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.6286401748657, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.4654388427734, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.8657531738281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.1257605552673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.3217673301696, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2273635864258, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.8561539649963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.3079986572266, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.5998420715332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.0371170043945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.5483183860779, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.5024003982544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.47920083999634, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.4395189285278, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.3895998001099, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6382393836975, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.0168023109436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.5456047058105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.6683201789856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9942359924316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.9411172866821, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.6548810005188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.1296005249023, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.1164746284485, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.661600112915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.5753617286682, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.6084814071655, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3305597305298, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.0164804458618, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.7988777160645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.0115189552307, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.0311970710754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.8952045440674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.0660743713379, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
538.7788844108582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1071.846718788147, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.9227204322815, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1073.27054977417, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.7475185394287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1077.9750394821167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.9838395118713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1082.78799533844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.2513628005981, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.9088034629822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.5451216697693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.7721562385559, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.4798412322998, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.4169611930847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.4590377807617, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.9686427116394, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.2668771743774, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.9203171730042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.9015970230103, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9767956733704, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.41599893569946, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.9601616859436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.8236804008484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.4391965866089, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.3846373558044, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.4899158477784, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.2964816093445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.1459202766418, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.5657544136047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.7374496459961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.9068813323975, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.1609625816346, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.8931188583374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.3432011604309, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.2140784263611, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.2542414665222, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.3694443702698, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.9828805923462, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.747838973999, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.6524796485901, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.8112020492554, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.0780849456787, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.8292856216431, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.2932806015015, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.5326428413391, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.5865559577942, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.7563209533691, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.31951379776, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.3539175987244, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.2407975196838, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.070234298706, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 539.3153643608093, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.4620747566223, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.9492840766907, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.7636799812317, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.2308721542358, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.5460758209229, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.0940809249878, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.8001565933228, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.2923202514648, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.1092820167542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 605.316162109375, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.8847994804382, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.1929640769958, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.6766428947449, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.2127981185913, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9396743774414, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.9163198471069, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.7244849205017, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.1000027656555, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.6432046890259, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.3168048858643, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 697.8276777267456, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.5575909614563, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 736.1916756629944, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.3156781196594, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.1681618690491, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 
8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 760.9014439582825, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.5169653892517, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.6296038627625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 728.6849594116211, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.6320023536682, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.751838684082, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.7249598503113, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.1246409416199, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.2083191871643, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 733.6545634269714, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 694.7238349914551, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.9833588600159, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 766.46000623703, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.9748768806458, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.6273593902588, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 736.2687969207764, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.9307174682617, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.6622433662415, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 771.6876792907715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.9667205810547, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 985.7188749313354, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1001.572003364563, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 986.2617588043213, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1001.8129587173461, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 983.9632034301758, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 1008.9296007156372, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 982.8913688659668, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1008.8852834701538, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.2398409843445, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.7081623077393, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.8484811782837, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.5873599052429, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.8672013282776, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.153920173645, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.7104015350342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.2427201271057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.4833607673645, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2651166915894, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.2048015594482, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.9199995994568, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.8700737953186, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.6121587753296, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.2052764892578, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.724956035614, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.2187190055847, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2000026702881, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.2348818778992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.7931289672852, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.1852769851685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.7321586608887, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.602560043335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, 
"num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.2657580375671, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.541437625885, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.257764339447, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.159679889679, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.5339164733887, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.5971202850342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.7740797996521, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.2430362701416, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.6649675369263, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.5172824859619, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.4684772491455, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 742.6668810844421, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.143358707428, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.9372806549072, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.7983999252319, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.4020833969116, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.2817621231079, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.4838371276855, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.606876373291, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.8448038101196, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.4238381385803, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.5772738456726, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.9470405578613, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.1017565727234, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.5865573883057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
509.616641998291, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.2857613563538, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.0148758888245, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.3879985809326, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.8740773200989, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.887363910675, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.52383756637573, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.3430366516113, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.1487998962402, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.8084759712219, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.0822372436523, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.7280020713806, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.91200256347656, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7006411552429, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.2462372779846, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.051203250885, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.9691200256348, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6312012672424, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.7724823951721, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.3990454673767, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.6767983436584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.2070407867432, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.5502390861511, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.9102444648743, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.374400138855, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.2923183441162, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 
2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.8424010276794, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.711042881012, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.529757976532, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.2929558753967, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 605.2484798431396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.4345560073853, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.8401598930359, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.5531277656555, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.2745552062988, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.687361240387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.0030460357666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.8001623153687, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.0979166030884, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 
256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.6596803665161, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 938.8708782196045, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 933.3428907394409, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 806.6062498092651, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.8475284576416, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.0080099105835, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 920.7534551620483, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.7260751724243, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 826.9105577468872, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.6179246902466, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 944.681601524353, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 816.4420700073242, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
833.6480093002319, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.1675186157227, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 926.2828874588013, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 812.500467300415, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 829.5297622680664, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 943.0897617340088, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 946.083517074585, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 818.0067253112793, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 836.5208101272583, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 940.1592016220093, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 928.2118368148804, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.1395235061646, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 823.1782293319702, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 941.5383958816528, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 949.1524744033813, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 812.5603246688843, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.959997177124, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.2051267623901, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.173749923706, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 809.9843168258667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 828.2486486434937, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1178.4764766693115, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1130.32320022583, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1186.4342308044434, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1136.905426979065, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1182.4462461471558, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1138.4574460983276, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1184.6959972381592, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1137.0959901809692, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.3907208442688, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.6041626930237, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.4972853660583, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.6107177734375, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.5347218513489, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.3566408157349, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 686.3699221611023, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.3311982154846, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.8641610145569, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.5596785545349, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.007040977478, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.1008014678955, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.029914855957, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.5635209083557, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.5428824424744, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.0348806381226, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.082558631897, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.2852754592896, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 686.1870408058167, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.2113585472107, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.5435152053833, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 668.8268756866455, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 629.6777606010437, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.5371189117432, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 883.8755130767822, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 764.1984009742737, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 895.3737640380859, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 767.0913600921631, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 887.8481578826904, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.7481570243835, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 894.3195199966431, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.0977635383606, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.8739252090454, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.1700768470764, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 707.6553606987, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 
96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.7859153747559, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 780.9352016448975, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.0686411857605, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.4377593994141, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.7073659896851, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 783.101761341095, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.9715209007263, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.5652809143066, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.4166445732117, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 790.0716853141785, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.2886385917664, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 712.1583938598633, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.3241620063782, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2521.7492961883545, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.7732825279236, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2530.7081508636475, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.0651245117188, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2544.6868801116943, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.168155670166, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2553.4073638916016, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.295684337616, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1943.988962173462, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1345.328164100647, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1949.3907451629639, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1361.799349784851, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1949.2251110076904, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1346.7343950271606, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1955.9214305877686, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1368.5054445266724, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1952.2484683990479, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1354.713110923767, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1957.517900466919, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1372.2995233535767, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1952.1230792999268, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1352.1305513381958, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1958.2923316955566, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1372.8006410598755, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1865.508975982666, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1185.5326461791992, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1865.8688163757324, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1190.4919862747192, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1867.5232028961182, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1188.845591545105, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1871.9436836242676, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1190.1843166351318, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7485.676460266113, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 976.5776109695435, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7486.932640075684, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 971.8083143234253, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7492.872543334961, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 970.3041553497314, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7513.126907348633, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 977.3367929458618, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.0385522842407, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.8475217819214, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 726.2756776809692, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.4318375587463, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1097.4406433105469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.076476097107, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1042.0361518859863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1065.3776025772095, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.7980847358704, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.800323009491, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.2513580322266, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.0417566299438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1086.9603252410889, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1050.87824344635, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1068.0244779586792, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1090.270881652832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.4615964889526, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.2632012367249, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 716.8827199935913, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.6560020446777, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1094.0374422073364, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1058.231987953186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1060.8553504943848, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1104.711675643921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 689.6995186805725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 694.4683265686035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.254876613617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.7734365463257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1090.0416040420532, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1057.1808004379272, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1068.3619260787964, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1093.8209676742554, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 732.2345566749573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 759.2289614677429, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 838.7801551818848, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 971.5772771835327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 
5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1048.3521604537964, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1026.6931247711182, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1091.0323190689087, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1234.908151626587, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.1403198242188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.4292802810669, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 837.9812812805176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 974.6336030960083, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1048.88032913208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.3462371826172, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1093.2025575637817, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1229.9054336547852, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 721.9833564758301, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.1385598182678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 835.3201770782471, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 973.6555194854736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1046.0542345046997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.1334390640259, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1088.440146446228, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1228.6219310760498, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.4796800613403, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.0143985748291, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 836.3923168182373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 965.1556777954102, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1049.1747093200684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 1026.3545608520508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1084.7732782363892, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1237.6918363571167, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 992.0294380187988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1112.4440050125122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1639.2615985870361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1693.176622390747, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1284.312801361084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1281.3150358200073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1837.6454257965088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1877.070083618164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1000.7039928436279, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1105.8030366897583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1652.9142379760742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1679.1923236846924, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1287.8228902816772, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1284.1814374923706, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1829.1524696350098, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1878.6201763153076, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1002.0670366287233, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1107.559208869934, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1646.8219089508057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1691.1127853393555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1302.864327430725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1271.6780757904053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1829.8835182189941, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1877.126865386963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1010.4499101638793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1111.6452836990356, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1626.6140604019165, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1701.1876964569092, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1311.0716772079468, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1289.9491214752197, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1828.4331321716309, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1878.756332397461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.7854385375977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.6414394378662, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.7289657592773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 723.1070423126221, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 698.9587235450745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.6572847366333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.9707188606262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 726.4900779724121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.7484803199768, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.5270400047302, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.261438369751, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.9302401542664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.4255986213684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 668.6576008796692, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 683.5809588432312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.5174403190613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.8935985565186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.2814407348633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.9567999839783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 722.4324798583984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.4417643547058, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.6425614356995, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 683.450882434845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.0975980758667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.4260764122009, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.4814395904541, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.8704028129578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.3076801300049, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.0713586807251, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.9750428199768, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.8915209770203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 733.9763164520264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.6404790878296, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.9841585159302, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 966.28737449646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 973.398232460022, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.9844861030579, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.8084750175476, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 988.4087991714478, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 989.3472099304199, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.8305583000183, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.2603168487549, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.8867177963257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 969.8819208145142, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.8454384803772, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.9564805030823, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 962.2441625595093, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1000.7641649246215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.2964754104614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.5947179794312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 935.3392028808594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 962.4923229217529, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 718.6081647872925, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 748.2320046424866, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 962.2454452514648, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1006.4216041564941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.0060791969299, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.2631993293762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 923.5367965698242, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 965.030722618103, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.9232001304626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.3897614479065, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 961.1070442199707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 990.4635190963745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 976.5908813476562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1307.876968383789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 854.9668741226196, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, 
"num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1143.5702323913574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.4764785766602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1294.7480010986328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 858.624005317688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1150.023045539856, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.3345623016357, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1303.1838464736938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 857.4694442749023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1149.169921875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 959.4745588302612, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1308.8129568099976, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 848.7980842590332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1142.75887966156, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.397759437561, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.963041305542, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.9703960418701, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.7083191871643, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.1889634132385, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.5752000808716, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.2022399902344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 728.457441329956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.4924812316895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.1862373352051, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.4017615318298, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.4884757995605, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 643.5337543487549, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.0561580657959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.9014372825623, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.5244817733765, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.3443269729614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.3439984321594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.4753613471985, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.5897603034973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.072639465332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.4684782028198, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.6260833740234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.2465620040894, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.7089614868164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} 
-{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.0382509231567, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.1743969917297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 737.7763223648071, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.905125617981, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.6401629447937, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.8627166748047, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 731.5620803833008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.9873633384705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 861.725435256958, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 869.6388721466064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.3446378707886, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 786.9830417633057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 794.2708778381348, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.727518081665, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 837.8744029998779, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 824.7273540496826, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.3700828552246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 771.9555163383484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 768.9867234230042, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.4281620979309, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 836.9027185440063, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 827.0968055725098, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 698.0323195457458, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 771.158881187439, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.3134398460388, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 659.259684085846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 827.6155185699463, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.9415969848633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 694.2118453979492, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 769.3417620658875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 768.8142418861389, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1696.726245880127, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1095.3347253799438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1690.3497505187988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1012.583203315735, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1687.821445465088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1016.8332862854004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1680.4811096191406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1016.026406288147, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.8689570426941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 718.7489604949951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 728.7094402313232, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.6454372406006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.6563220024109, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.649760723114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.165602684021, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.4902367591858, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 706.8849611282349, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 676.0153603553772, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.7264060974121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.930558681488, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 694.1463971138, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.0579223632812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.0336036682129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.7243223190308, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.3604731559753, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.7030358314514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.768479347229, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.6012787818909, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.1963195800781, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.5147228240967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.9017601013184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.2670373916626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 908.1606340408325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 856.5286493301392, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 879.2572736740112, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.7097587585449, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 881.1470413208008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 816.742889881134, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 877.0390462875366, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 809.639356136322, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.5998406410217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.9131183624268, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.7036828994751, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.5089664459229, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 802.1657562255859, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 722.7801632881165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 734.0424060821533, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 761.6350388526917, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.8772802352905, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.1888031959534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.6971211433411, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.5595231056213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 783.0963206291199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 712.9271984100342, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 732.0092821121216, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.9528021812439, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.4572839736938, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.0369625091553, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.7460803985596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.8643155097961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.1459245681763, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.2678399085999, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 731.0860800743103, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 758.3113598823547, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.0057597160339, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.3947229385376, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.6780800819397, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.7806429862976, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 782.7446389198303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.9985599517822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
729.7785639762878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.8047943115234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.6348867416382, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.4726419448853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 736.4572787284851, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1042.372169494629, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 782.5702381134033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 728.8771224021912, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 830.9343957901001, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1109.3494319915771, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.0964789390564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.7124767303467, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 735.6393599510193, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1027.21200466156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 782.3151993751526, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 735.1779174804688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.4380836486816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1104.9745559692383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.8883213996887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 651.8678402900696, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.7575988769531, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1022.3705530166627, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 788.6702418327332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 733.3361625671387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 802.0587277412415, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1118.440637588501, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.4745554924011, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 651.5785574913025, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.1488018035889, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1038.227367401123, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 781.5470385551453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.268159866333, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 806.988480091095, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1101.4806365966797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 950.0555229187012, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1310.5729627609253, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1369.9715328216553, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 958.7740802764893, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1227.5124883651733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1264.5772743225098, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 954.484486579895, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1321.5358352661133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1373.0083227157593, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.434720993042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.0912103652954, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1272.1817636489868, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 940.1297616958618, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1315.9067153930664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1377.9009437561035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 954.7270393371582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1204.5113611221313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1271.9702434539795, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.7540836334229, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1308.7649631500244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1371.9257545471191, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 949.5920038223267, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1208.7875175476074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1279.6580696105957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.9225625991821, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.5030355453491, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.1164793968201, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.9249606132507, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.0934371948242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.1435217857361, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.5502362251282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.4315180778503, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.2571177482605, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.6041569709778, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.6470394134521, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.0974354743958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.4431972503662, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.4862418174744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.6593637466431, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.4078392982483, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.9287972450256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.9838366508484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.5484828948975, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.9659194946289, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.932954788208, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.4839997291565, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.2364768981934, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.5532846450806, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.3433589935303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.0244832038879, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.6204776763916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.126874923706, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.3841619491577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.6316757202148, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.3606395721436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.306236743927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.8017630577087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.1667213439941, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 775.5905604362488, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.4523162841797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.2667217254639, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.483362197876, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.90895652771, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 783.9528012275696, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.753598690033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.25727891922, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.4220838546753, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 788.037919998169, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} 
-{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.8118419647217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.4139165878296, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 761.2171149253845, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 785.7353639602661, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.7551975250244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.2155237197876, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.6702446937561, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.327362537384, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.3673620223999, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.6428823471069, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.5719995498657, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.8832001686096, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.5694408416748, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.6315197944641, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 758.9847946166992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.5702438354492, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 685.7436776161194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.151674747467, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.9441566467285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.57967710495, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.0300731658936, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1167.2147226333618, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1013.7425661087036, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 902.6828861236572, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.4001607894897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 1180.5481576919556, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1018.2075262069703, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 901.6348791122437, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.3236880302429, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.938554763794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1022.9980802536012, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 896.7939186096191, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 809.9787211418152, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1171.892008781433, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1028.0190324783325, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 893.1148767471313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.4886426925659, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.6440000534058, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 
256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.2859253883362, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.1676826477051, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.7723145484924, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.1627197265625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.645441532135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.4489598274231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.156955242157, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.5331196784973, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.1622338294983, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.8876781463623, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.1713590621948, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.9614410400391, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.0238375663757, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.7977585792542, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.2014403343201, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.9959983825684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.0420851707458, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.8334455490112, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.3577599525452, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.0859212875366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.3745636940002, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.1040000915527, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.1937599182129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.3727989196777, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.7201552391052, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 620.9391975402832, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.7889609336853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.3163194656372, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.1294393539429, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.8649578094482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.2844815254211, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.6995224952698, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.5838370323181, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.9513621330261, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.3766403198242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.0257635116577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 676.814079284668, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.0062355995178, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.0303955078125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 685.895037651062, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.6356806755066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.6977553367615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.1334414482117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 682.0091199874878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.6371192932129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.4497594833374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1468.002233505249, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.894889831543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1499.080638885498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 821.6987228393555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1503.625283241272, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.1233558654785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1509.8982429504395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 816.5620756149292, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.7532801628113, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.4809622764587, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.292640209198, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.4980754852295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.4260830879211, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.1028738021851, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.7606468200684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.2907228469849, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.5968012809753, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 695.5480027198792, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.0720009803772, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.5219230651855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.4859156608582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.3099131584167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.9305653572083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.4486413002014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.1262426376343, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.6668844223022, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.0465569496155, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.9127988815308, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.4727964401245, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.6990361213684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.1230397224426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.3364825248718, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 923.6857509613037, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.2328023910522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 919.8958349227905, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.1665639877319, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 926.2558460235596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.5401582717896, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 928.5524845123291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.236319065094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.6393632888794, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.5728001594543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.1913542747498, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.3193593025208, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.6684856414795, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.3065605163574, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.0428791046143, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 724.2015957832336, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.1747217178345, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.2009625434875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.031834602356, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.4167985916138, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.1892762184143, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.6353578567505, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.8255987167358, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
733.3033609390259, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.1230430603027, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.8726420402527, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.6031999588013, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.0008015632629, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.5527997016907, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.4808020591736, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.4268751144409, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 737.0497584342957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.675678730011, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.5540781021118, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.8072023391724, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.7966361045837, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.5872020721436, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.8929567337036, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.705436706543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.0990471839905, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.7598390579224, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 798.2548713684082, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.3843216896057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.6667141914368, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 832.005124092102, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 796.3936042785645, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.6142435073853, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 799.5707249641418, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.0219202041626, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, 
"num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.6415944099426, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 826.2671899795532, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 807.5595235824585, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.5057621002197, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 798.8775968551636, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.7739253044128, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.4758358001709, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 824.1686344146729, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 813.2708883285522, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 680.5820846557617, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 803.3567953109741, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 760.6950426101685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.9761538505554, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 831.2022352218628, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 824.2894315719604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1034.8675298690796, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1076.834397315979, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1033.165111541748, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1080.110559463501, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1036.677598953247, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1071.342568397522, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1042.8159952163696, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1076.5801572799683, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.6777558326721, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.9086394309998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.0649647712708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.6707172393799, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.0371150970459, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.1881642341614, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.5228805541992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.407518863678, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.1622333526611, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.0038371086121, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.4260754585266, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.2479968070984, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.6116786003113, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.41952085495, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.3054394721985, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.4022407531738, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.2796859741211, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.0583939552307, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.5739192962646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.6763167381287, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.7742366790771, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.5411267280579, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.3063945770264, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.3121604919434, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.4526419639587, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.7542443275452, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.4134411811829, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.8220868110657, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.557918548584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.8028812408447, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.9056015014648, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.7588810920715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.8371257781982, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.4841628074646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 796.2043261528015, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.6214365959167, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.9256019592285, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.8385629653931, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 795.064799785614, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.0252842903137, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 713.1275177001953, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.3449602127075, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 800.9657621383667, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.6259236335754, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.9332809448242, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.4929566383362, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 805.6567907333374, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.897759437561, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.1820769309998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.79807472229, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.8779211044312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.330237865448, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.0456070899963, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.7457642555237, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.7774424552917, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.6425533294678, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.3000011444092, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.3555235862732, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.4048018455505, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.995677947998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.1623992919922, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.880482673645, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.5331153869629, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.3972868919373, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.8963179588318, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.6204743385315, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.6216015815735, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.6313576698303, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.3305568695068, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.3137636184692, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.923357963562, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.0307250022888, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.0262379646301, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.4380788803101, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.4899139404297, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.6489672660828, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.3430376052856, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.3649625778198, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
635.8425641059875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.3886432647705, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.3171191215515, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.084005355835, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.9067215919495, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.9126448631287, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.4215984344482, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.4398412704468, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.7977557182312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.0243263244629, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 954.6132850646973, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 983.7791872024536, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 850.0809669494629, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.9438381195068, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 947.2068691253662, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.7796764373779, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 835.5209541320801, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 843.000955581665, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 954.2955207824707, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 989.6404790878296, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 853.1310415267944, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 853.5799932479858, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 946.4985513687134, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 964.4591951370239, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 831.9080018997192, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 841.8902492523193, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.1108884811401, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 996.4556884765625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 866.5036773681641, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 859.8441696166992, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 948.9329624176025, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 969.2694330215454, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 839.833607673645, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.882402420044, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 964.3443155288696, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 998.8769674301147, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 862.4568033218384, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 866.3609409332275, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
956.7219305038452, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 973.5979223251343, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 850.3876829147339, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 843.493595123291, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1247.4396705627441, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1196.6036748886108, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1253.0902481079102, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1203.191204071045, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1259.3569564819336, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.327359199524, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1262.1897602081299, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1210.5126333236694, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 706.1572790145874, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 689.6748733520508, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.9174389839172, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 686.6585659980774, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.3212814331055, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.4791984558105, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.0124807357788, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.5126419067383, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.4780807495117, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.6030468940735, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.6009602546692, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.299684047699, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.5684823989868, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 694.6448016166687, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.4639978408813, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.6072058677673, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.9710445404053, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.3824005126953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.6932830810547, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.6959962844849, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.5201606750488, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.6991958618164, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.1486377716064, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.62624168396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 924.7968101501465, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.4439988136292, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
929.9596834182739, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 800.6940770149231, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.9561529159546, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 803.1054401397705, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 928.0075168609619, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.3619208335876, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.538405418396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.9500823020935, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 735.8307242393494, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.3406448364258, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 812.0356750488281, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.1200032234192, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.8224005699158, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.6519947052002, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.1145706176758, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.0332770347595, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.5864043235779, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.0294361114502, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.8027200698853, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.3356828689575, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.9905648231506, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.3902406692505, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2633.9260864257812, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.5137639045715, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2653.2561588287354, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.3406429290771, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2671.504011154175, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.7409543991089, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2676.8460655212402, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.1438336372375, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1986.9647979736328, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1382.1910429000854, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1993.8891220092773, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1394.860486984253, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1995.9868907928467, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1387.7804803848267, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2005.5671977996826, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1404.0884685516357, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1998.2713508605957, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1390.7809686660767, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2006.6262435913086, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1403.6855936050415, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2000.3953552246094, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1385.6865692138672, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2011.1102676391602, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1401.5947246551514, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1879.7185611724854, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1219.4129657745361, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1878.4107398986816, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1224.373435974121, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1885.1270198822021, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1227.7771139144897, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1892.499189376831, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1234.9451208114624, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7550.028991699219, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 995.6875085830688, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7541.615829467773, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 998.9518451690674, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7552.024002075195, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1006.8326473236084, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7590.907745361328, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1004.2790222167968, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2325.2532863616943, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2296.137933731079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2365.2686405181885, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2490.0865650177, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3382.0705795288086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3328.2355308532715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3364.321632385254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3459.93408203125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2300.016326904297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2301.934242248535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2368.136806488037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2471.312484741211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3389.721736907959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3397.2830390930176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 3444.3581008911133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3542.7696228027344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2287.9634952545166, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2306.846227645874, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2372.6742362976074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2473.692150115967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3400.797920227051, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3404.9420738220215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3436.693572998047, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3542.7255821228027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2277.599334716797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2302.4955081939697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2368.753261566162, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2461.8998622894287, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3392.7294158935547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3403.560676574707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3439.5680046081543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3539.375057220459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2369.559679031372, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2488.936014175415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2749.2487812042236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3197.5519943237305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3345.940628051758, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3309.0960121154785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3521.715679168701, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4038.7622451782227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2348.0203247070312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2466.6383838653564, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2696.7115211486816, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3143.889112472534, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3351.6892623901367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3307.6143836975098, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3513.3383560180664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4006.330547332763, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2350.494394302368, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2457.4228858947754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2722.036647796631, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3140.5150413513184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 3346.654853820801, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3311.755790710449, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3511.783981323242, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4017.422027587891, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2339.5408153533936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2447.5622177124023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2712.127857208252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3127.0948791503906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3350.947322845459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3311.8330001831055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3508.85347366333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4016.8814468383794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3278.179054260254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3665.1455879211426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5271.618709564209, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5403.523712158203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4104.797115325928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4187.295684814453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6038.239364624023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6186.793899536133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3291.3073348999023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3584.54158782959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5321.689910888672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5469.034729003906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4076.164455413818, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4172.8862380981445, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6044.528961181641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6216.968612670898, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3341.503086090088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3582.666721343994, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5337.356605529785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5500.024948120117, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4104.877471923828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4180.167636871338, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6074.777431488037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6231.737880706787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3335.456771850586, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3566.8740463256836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5343.808937072754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5500.974235534668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4102.738361358643, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4181.780014038086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6076.649761199951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6225.860290527344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1962.5072002410889, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.7179107666016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2076.2822341918945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2298.2894229888916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2183.277425765991, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2096.8153762817383, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2161.693925857544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2338.120641708374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1856.0558605194092, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1898.8311862945557, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1980.9948635101318, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2200.7787227630615, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2191.509437561035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2007.4171161651614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2067.5657653808594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2255.6475162506104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1842.3112106323242, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1887.1766376495361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1982.7358436584473, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 2195.7697582244873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2180.8512210845947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1979.1953563690186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2053.30096244812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2263.948497772217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1829.9763298034668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1877.648983001709, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1973.911533355713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2179.156322479248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2159.1721534729004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1967.6132678985596, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2047.0966434478762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2262.039031982422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2076.0449600219727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2408.923215866089, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3096.556329727173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3132.0811080932617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2250.733766555786, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2355.063190460205, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3062.4307250976562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3213.1310176849365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2000.9524917602537, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2306.1835193634033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2765.110397338867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2922.087516784668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2225.341272354126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2221.8268871307373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3034.3875122070312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3171.729145050049, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2001.184320449829, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2300.3471851348877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2771.8313598632812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2931.5155124664307, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2233.65008354187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2222.94864654541, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3042.3180961608887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3191.5511798858643, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1989.2753219604492, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 2291.641607284546, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2773.565902709961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2929.502239227295, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2218.038396835327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2214.0537548065186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3035.0564861297607, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3182.324962615967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3193.9623737335205, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4151.05598449707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2707.0990562438965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3603.6607933044434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3053.8827228546143, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4110.69278717041, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2620.7814407348633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3602.5545501708984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3066.490068435669, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4123.734569549561, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2618.026714324951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3613.1824111938477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3052.3884677886963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4118.141632080078, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2604.8092937469482, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3612.070083618164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1822.4259090423584, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1874.323844909668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2015.3076648712158, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2319.6843242645264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1903.288974761963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1827.4851322174072, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1902.9497337341309, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2261.23646736145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1751.5752220153809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1790.8558368682861, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1863.1598567962646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2037.8790569305418, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1859.9424266815186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1760.6377506256104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1853.454704284668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2193.4414291381836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1737.6414394378662, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1779.098720550537, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1850.3230381011963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2037.8512001037595, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1853.8227272033691, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1748.0233573913574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1850.3932857513428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2187.209596633911, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1732.4563312530518, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1776.4003276824951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1844.5606327056885, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2046.5270519256592, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1846.9128227233887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1739.2776107788086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1839.2604732513428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2190.531349182129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1924.9524688720703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2712.617120742798, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2740.6832122802734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2072.7961921691895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2395.9286403656006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2441.562900543213, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1862.3895931243896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2307.3287963867188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 2353.3585262298584, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1972.4908828735352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2182.807836532593, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2259.4164752960205, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1856.1369514465332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2304.610252380371, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2343.8676834106445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1969.2479801177979, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2197.0961380004883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2255.193281173706, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1850.4859161376953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2301.2273597717285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2338.0764961242676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1963.9484786987305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2194.8512077331543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2248.5825538635254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5152.507476806641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3510.769805908203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5001.980152130127, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3111.588478088379, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5021.744918823242, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3100.3017807006836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5033.472805023193, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3095.7464027404785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1899.7974395751953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1989.0390491485596, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2155.0366401672363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1821.1745738983154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1757.2502517700195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1873.7697505950928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1796.9629001617432, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1787.0764827728271, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1792.626085281372, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1696.8927955627441, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1660.7660675048828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1697.2890949249268, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1783.3083248138428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1789.6009731292725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 
1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1797.2740745544434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1672.0020771026611, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1654.3060684204102, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1677.418556213379, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1793.548812866211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1792.2803211212158, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1782.6718521118164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1662.3835182189941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1659.9935913085938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1679.461441040039, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2749.5820713043213, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2582.4508666992188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2474.4275283813477, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2281.0542488098145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2492.0643424987793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2294.744634628296, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2490.7241439819336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2291.5478515625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1648.95601272583, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1632.6124954223633, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1698.1339168548584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1813.8212776184082, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2105.393114089966, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1983.3755207061768, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2031.1073875427246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 2109.0611267089844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1683.9328002929688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1686.5449619293213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1752.660322189331, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1875.2216148376465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2051.291847229004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1966.0094547271729, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2028.983039855957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2110.2267169952393, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1700.674877166748, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1689.3655967712402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1758.1743907928467, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1870.62593460083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 
5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2049.725294113159, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1975.6360054016113, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2030.1319885253909, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2113.5256004333496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1703.7476921081543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1687.9036903381348, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1756.5970993041992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1874.8363208770752, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2054.7152042388916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1972.1846294403076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2017.085762023926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2112.240810394287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1884.852647781372, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1812.8759860992432, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2065.0228881835938, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2825.2681636810303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2098.7470531463623, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1988.223533630371, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2228.924627304077, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3039.3081378936768, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1938.3608150482178, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1848.5539150238037, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2081.45263671875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2842.858896255493, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2141.501922607422, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2002.2487735748289, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2205.667200088501, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3079.0238189697266, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1939.9072074890137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1843.797435760498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2082.3193645477295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2848.2295989990234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2139.2094230651855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1998.5345554351807, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2220.647678375244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3078.7990283966064, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1933.6048030853271, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1843.5536003112793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2075.6279945373535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2857.696475982666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2131.0843181610107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1994.0660572052002, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2208.0428886413574, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3088.5987091064453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2494.1913509368896, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3575.256824493408, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3731.499786376953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2524.120969772339, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3368.1519889831543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3493.7064170837402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
2544.136486053467, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3594.0499687194824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3755.416603088379, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2597.4507331848145, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3355.1088333129883, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3512.6542472839355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2541.4427375793457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3603.0031967163086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3766.8772888183594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2598.8531494140625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3362.179698944092, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3514.2855644226074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2544.0563106536865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3607.271041870117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3767.915325164795, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2596.689920425415, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3368.184814453125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3514.6094703674316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1466.7193603515625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1315.921926498413, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1363.7137603759766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1489.0710401535034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1563.353443145752, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1410.4839992523193, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1455.1815938949585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1589.663519859314, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1472.6380681991577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1327.1510362625122, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1373.6737632751465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1484.6588850021362, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1563.9300870895386, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1420.680809020996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1462.9547309875488, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1590.7044792175293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1474.281120300293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1327.7479982376099, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1372.403998374939, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1484.756326675415, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 1556.3782405853271, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1420.3697633743286, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1459.2766427993774, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1596.321930885315, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1464.4580745697021, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1319.2007970809937, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1369.5771169662476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1478.9636850357056, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1551.454553604126, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1415.4937601089478, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1457.6584100723267, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1592.440013885498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1782.2732830047607, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} 
-{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1570.7563304901123, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2008.481435775757, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2061.759834289551, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1740.7611274719238, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1565.6084775924683, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2070.2472019195557, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2120.11905670166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1801.8465518951416, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1579.83247756958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1994.480333328247, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2071.0822200775146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1755.3910636901855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1549.2913484573364, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2041.7860889434817, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2127.9401779174805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1801.5620708465576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1576.0849714279175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.2606525421143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2084.2167949676514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1752.4904155731201, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1547.5499153137207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2045.433759689331, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2117.548942565918, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1792.079210281372, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1572.1385622024536, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.6809558868408, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2078.2939434051514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1754.0628719329834, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1538.94784450531, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2046.2993526458743, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2124.567337036133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2197.849750518799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3193.8487911224365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2662.544479370117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2404.2737674713135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2154.6292972564697, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3192.3340702056885, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2675.840139389038, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2419.6913719177246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2160.6534671783447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3193.9822578430176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2671.11008644104, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2421.355199813843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2164.7603130340576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3193.477268218994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2682.7462482452393, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2421.277904510498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1262.282075881958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1201.5086364746094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1422.7737617492676, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1465.2592086791992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1411.0251140594482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1211.991844177246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1426.995348930359, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1457.250075340271, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1260.540795326233, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1204.4737720489502, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1410.3353548049927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1414.775996208191, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1445.5942392349243, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1215.5359888076782, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1445.9395265579224, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1455.5521726608276, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1259.032006263733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1202.6862335205078, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1409.5865678787231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1415.9948873519897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1458.6974430084229, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1213.9878416061401, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1438.3478355407715, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1459.9865627288818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1255.8263969421387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1198.6267185211182, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1411.665916442871, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1416.563835144043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1455.6579303741455, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.1926383972168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1438.5614347457886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1456.0063982009888, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1599.780478477478, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1674.1366481781006, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1665.4646492004395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1638.871350288391, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1568.3475160598755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1652.6535987854004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1678.5615921020508, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1625.1198530197144, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1568.5704040527344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1654.5060920715332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1675.6305503845215, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1630.347981452942, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1558.626732826233, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1653.868808746338, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1675.4491329193115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1637.54798412323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3723.2805252075195, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2257.303991317749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3780.8773231506348, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2106.2124824523926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3792.2976875305176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2102.2619342803955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3819.5900917053223, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2096.620168685913, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1272.847843170166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1321.9567966461182, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1327.2099304199219, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1636.2148666381836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1142.5174236297607, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1166.6289710998535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1239.5072078704834, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1303.6526489257812, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1302.2471952438354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1705.6990337371826, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1180.5571269989014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1189.1742515563965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1238.2996797561646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1306.4175939559937, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1309.250078201294, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1708.7465476989746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1183.8438367843628, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1185.7729530334473, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1227.967824935913, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1300.3692817687988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1311.3686323165894, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1683.2121562957764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1183.9692735671997, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} 
-{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1182.3622417449951, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2217.439832687378, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1681.178903579712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2210.387706756592, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1578.6976099014282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2194.725122451782, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1576.905426979065, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2222.7692699432373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1566.0979223251343, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1391.3511991500854, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1326.030879020691, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1394.6120071411133, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1702.0334434509277, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1469.5316743850708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1418.9839887619019, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1492.0841598510742, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1889.5489692687988, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1465.7860803604126, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1405.6204748153687, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1473.4236860275269, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1732.0811176300049, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1514.0839862823486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1469.3316745758057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1560.0097703933716, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1913.1121635437012, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1468.4047985076904, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1407.507038116455, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1477.4684762954712, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1734.9180698394775, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1505.677604675293, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1465.8600044250488, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1551.3670349121094, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1911.4406299591064, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1472.2489643096924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1406.6676807403564, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1475.433759689331, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1738.7915229797363, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1506.9355249404907, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1463.9012813568115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1554.9022483825684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1917.3750305175781, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1627.5609731674194, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1889.9177742004395, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1858.1926345825195, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1848.3979225158691, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2052.4164867401123, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2084.5350456237793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1703.5097694396973, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1926.6430377960205, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1928.4545707702637, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1876.5331077575684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2061.3913536071777, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2096.091833114624, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1698.87375831604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1943.1470489501953, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1924.9762916564941, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1876.066541671753, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2075.983829498291, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2122.257432937622, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1705.8124732971191, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1953.1328105926514, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1918.957290649414, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1878.0246257781982, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2080.8945655822754, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2123.6931324005127, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2629.4921493530273, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2685.147657394409, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2702.979507446289, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2819.9939155578613, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2699.8854446411133, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2820.817451477051, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2721.4918422698975, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2827.228488922119, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1057.5131225585938, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1037.5414419174194, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1220.5158424377441, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 
64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1223.4491205215454, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1152.209768295288, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1065.4771184921265, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1388.2999992370605, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1394.9768114089966, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1084.7675275802612, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1066.4967966079712, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1241.1831998825073, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1210.3449630737305, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.6248006820679, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1088.4873628616333, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1395.546236038208, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 1402.5727939605713, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1084.226884841919, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1067.0102453231812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1246.3929605484009, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1215.821294784546, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1173.4830379486084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1084.2636823654175, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1388.6584043502808, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1385.0230360031128, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1081.43967628479, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1064.3951988220215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1236.0428762435913, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1219.1348791122437, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} 
-{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.184947013855, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1084.03968334198, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1391.0931253433228, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1387.564001083374, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1641.020975112915, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1325.3019332885742, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1895.453462600708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1529.0574502944946, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1713.3363246917725, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1349.8348760604858, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1972.0801734924316, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1509.5516729354858, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1719.552812576294, "config": {"BLOCK_SIZE_M": 
64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1342.7391862869263, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1983.0603313446045, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1520.4592037200928, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1719.7990322113037, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1343.1979084014893, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1978.858060836792, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1513.9113664627075, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 924.419846534729, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1015.1006412506102, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 984.9289560317993, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.1942405700684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 998.3777523040771, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 955.9843111038208, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 938.6153650283813, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1023.9428758621216, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 978.4814262390137, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 951.4891242980957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1012.5552034378052, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 970.5814456939697, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 934.7078466415405, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1026.7471981048584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 983.0995178222656, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 949.6609592437744, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1015.2871990203857, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 974.1088008880615, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 935.1436853408813, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1021.562876701355, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 979.0430450439453, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.3177680969238, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1009.8452806472777, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 972.2926330566406, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1321.053442955017, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1291.3400077819824, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1347.4276781082153, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1307.739839553833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1346.5800046920776, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1306.2964820861816, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1350.1857662200928, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1312.5079917907715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1218.6171197891235, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1038.818564414978, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1231.9736003875732, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1057.2744035720825, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1244.9772882461548, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1063.8995265960693, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1258.5089683532715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1064.3272066116333, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1641.5592002868652, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1737.5435066223145, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1553.2987213134766, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1575.1219129562378, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1630.8640050888062, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1689.3091297149658, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1540.603518486023, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1544.8532819747925, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1666.7198276519775, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1775.0424194335938, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1585.210394859314, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1601.134557723999, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1660.9853076934814, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1726.457748413086, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1563.943510055542, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1587.3844861984253, "config": {"BLOCK_SIZE_M": 128, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1668.0526447296143, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1786.3716888427734, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1583.89967918396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1608.2689571380615, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1663.1523132324219, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1749.2984008789062, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1564.810242652893, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1584.557285308838, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1676.6187191009521, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1791.620798110962, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1585.8390522003174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1604.8867177963257, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1668.8651371002197, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1741.0388660430908, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1550.6168031692505, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1593.7572717666626, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2199.7723293304443, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2105.3575897216797, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2271.6017532348633, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2201.5503787994385, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2278.3329582214355, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2194.853754043579, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2277.670087814331, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2203.658227920532, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1149.4248008728027, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1201.0124731063843, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1114.7900915145874, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1109.6756744384766, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1131.1153602600098, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1050.0936031341553, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1161.6267204284668, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1212.163200378418, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1123.1193590164185, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1124.148645401001, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1149.4571208953857, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1058.7790203094482, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1162.6384019851685, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1211.0390329360962, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1120.8777523040771, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1129.510407447815, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1147.9241609573364, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1063.5831928253174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1174.305911064148, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.2958402633667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1129.4958400726318, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1135.2355241775513, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1154.0955114364624, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1058.0985641479492, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1639.860315322876, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1420.9270286560059, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1669.1107082366943, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1458.9315223693848, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1670.8129501342773, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1453.2006549835205, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1672.5340747833252, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1462.3507118225098, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1355.4844760894775, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1018.0417585372925, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1215.4121589660645, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 895.5651235580444, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1386.5420818328857, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1022.2633600234985, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1249.7433519363403, 
"config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 901.5065574645996, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1401.9988918304443, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1030.3695964813232, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1235.3534317016602, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 898.5804748535156, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1402.422399520874, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1032.8955125808716, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1244.788475036621, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 908.5278415679932, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4880.365619659424, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1023.4084749221802, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4853.734874725342, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1020.901927947998, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4877.472496032715, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1040.196795463562, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4915.046195983887, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1046.9940662384033, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2231.3545513153076, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1594.9457597732544, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2256.235990524292, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1611.6910457611084, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2252.9142475128174, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1625.1665496826172, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2273.8785552978516, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1639.686689376831, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2256.4112091064453, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1625.8951950073242, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2277.885446548462, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1642.6534271240234, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2279.5150470733643, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1631.4622402191162, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2299.007034301758, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1662.7503967285156, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1952.05246925354, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1384.4667196273804, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1963.1043338775635, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1399.660964012146, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 2000.750379562378, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1400.0662326812744, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2086.5492725372314, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1417.965440750122, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8151.043663024902, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1137.5107192993164, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8101.377296447755, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1147.5318479537964, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8126.138877868651, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1155.546555519104, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8187.997932434081, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1172.4057626724243, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4501.772003173828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4476.681880950928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4595.272674560547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4824.999027252197, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6491.373138427734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6451.748886108398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6519.189758300781, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6694.749221801758, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4445.0982666015625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4513.853130340576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4607.033271789551, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4758.890552520752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6519.580955505371, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6620.930976867676, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6660.732536315918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6863.767547607422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4385.213603973389, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4488.21346282959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4589.728145599365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4719.666290283203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6550.609931945801, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6623.67244720459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6695.572891235352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6868.200759887695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4349.499378204346, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4481.035556793213, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 4566.498260498047, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4689.734401702881, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6534.305877685547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6595.338973999023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6661.113128662109, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6858.8945388793945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4602.259044647217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4837.270107269287, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5330.09407043457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6223.501930236816, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6476.223373413086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6443.577041625977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6813.443222045898, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7823.028411865234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4516.089916229248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4762.248134613037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5140.965576171875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6062.722225189209, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6474.412040710449, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6450.106506347656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6828.327674865723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7763.375625610352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4491.726551055908, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4710.494422912598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5138.614902496338, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6004.417095184326, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6473.797721862793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6457.386703491211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6795.108413696289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7782.135162353516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4452.567825317383, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4676.65132522583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5112.550506591797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5957.404079437256, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6474.9260330200195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6447.209930419922, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6796.417465209961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7806.230392456055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6332.819709777832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7135.026893615723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10207.952690124512, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10461.305046081543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7903.491401672363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8124.693450927735, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11747.277526855469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12013.495712280273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6241.043319702148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6917.86865234375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10258.932838439941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10536.24095916748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7833.250579833984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 
8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8106.434211730956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11732.614250183105, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12013.905944824219, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6245.468158721924, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6870.0947189331055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10290.87963104248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10575.134773254395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7885.059509277344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8121.0345458984375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11767.666091918945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12055.329704284668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6243.568458557129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6835.126037597656, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10312.058029174805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10589.687614440918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7896.106338500977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8122.471122741698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11756.274185180664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12066.599807739258, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3793.312530517578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3828.1289863586426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4013.2694244384766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4440.006904602051, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4149.987201690674, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4081.1844825744624, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4182.929744720459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4452.187042236328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3523.9007568359375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3603.867988586426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3802.4729537963867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4189.268283843994, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4114.420166015625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3937.844982147217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3997.6576042175293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4250.606575012207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3431.3582038879395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3543.093090057373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3768.696460723877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4149.169750213623, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4069.345607757568, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3769.8972702026367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3991.5707397460938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4240.783576965332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3385.4092407226562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3507.2334480285645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3736.8077087402344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4120.274543762207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4071.1187171936035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3779.008140563965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3910.220470428467, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 4233.52819442749, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3967.543830871582, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4669.752979278564, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5991.007862091064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6032.094097137451, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4294.205303192139, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4483.299198150635, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5904.977283477783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6238.525276184082, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3837.508964538574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4345.179653167725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5309.876937866211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5604.533748626709, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4191.069240570068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4218.344631195068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5845.201778411865, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6097.564868927002, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3794.8515129089355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4336.195507049561, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5316.790561676025, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5619.11600112915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4163.519535064697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4168.171539306641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5857.406425476074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6119.179668426514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3754.4336128234863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 
64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4299.7100830078125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5316.496448516846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5622.731857299805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4150.717430114746, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4162.398376464844, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5858.257732391357, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6120.5659103393555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6175.119705200195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8049.147415161134, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5206.583499908447, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6975.384483337402, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5865.055198669434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 7960.373420715332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4995.047397613525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6953.708953857422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5860.794429779053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7963.560791015625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5025.825271606445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6950.38818359375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5831.2006187438965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7974.543342590332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5018.5211753845215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6963.239974975586, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3511.988945007324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3601.803379058838, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3880.9281730651855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4476.836833953857, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3647.3213958740234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3526.692314147949, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3649.1643142700195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4374.498043060303, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3328.064708709717, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3378.2118225097656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3554.215850830078, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3834.5805168151855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3489.000473022461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3372.9222297668457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3561.5489959716797, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4130.829277038574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3264.5939445495605, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3322.091064453125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3520.376625061035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3820.1326751708984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3445.870590209961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3332.6084327697754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3524.8219108581543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4128.100337982178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3232.072582244873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3296.494083404541, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3494.9300575256348, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3819.941749572754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3415.5188941955566, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3307.6962089538574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3500.6948471069336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4127.466907501221, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3685.0516510009766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5334.110870361328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5270.118885040283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3945.6153678894043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4625.023670196533, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4706.414222717285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3490.0711822509766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4267.937145233154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, 
"num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4407.587203979492, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3694.404468536377, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4087.0539855957027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4233.083534240723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3445.5129432678223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4267.350101470947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4374.905776977539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3669.6294593811035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4086.6883659362793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4229.765281677246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3425.679397583008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4261.112632751465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 4378.818035125732, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3669.196300506592, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4090.2126121521, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4229.567489624023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9747.15045928955, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6737.411727905273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9281.757926940918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5949.799861907959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9250.796775817871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5940.980663299561, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9325.881004333496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5935.764141082764, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3639.725818634033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 
2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3866.80477142334, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4113.134059906006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3515.4472160339355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3507.6281929016113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3597.716808319092, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3372.3005867004395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3266.4663696289062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3320.0749015808105, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3252.4467277526855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3069.560136795044, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3151.444139480591, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3343.192958831787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3266.6601753234863, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3302.898406982422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3184.32767868042, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3060.3897380828857, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3122.3334407806396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3354.7313690185547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3264.2067527770996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3299.488925933838, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3152.651844024658, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3049.893922805786, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3127.1352100372314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5217.597427368164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4897.487201690674, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4590.868148803711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4204.952335357666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4624.66064453125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4243.705749511719, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4650.420455932617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4257.638244628906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3118.718252182007, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3088.4513664245605, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3228.6662673950195, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3433.58154296875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3766.613426208496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3607.0473289489746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3705.599250793457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, 
"num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3869.3489265441895, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3205.5910301208496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3174.9499320983887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3300.84716796875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3504.136791229248, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3701.3529777526855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3630.0294303894043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3729.504623413086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3860.379066467285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3223.0859375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3172.332019805908, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3299.848003387451, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3501.541748046875, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3716.8892860412598, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3623.1249809265137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3718.340435028076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3857.16495513916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3225.405445098877, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3168.0207920074463, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3287.7656173706055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3495.246696472168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3722.9819297790527, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3623.827476501465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3765.1364517211914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3869.289722442627, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3466.744804382324, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3419.8892784118652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3874.754066467285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5302.344951629639, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3880.952663421631, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3643.459529876709, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4094.6449661254887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5723.1366539001465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3581.3921546936035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3455.153121948242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3870.364513397217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5342.457275390625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3942.040557861328, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 
8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3695.1289558410645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4058.352012634277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5768.589763641357, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3583.4195518493652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3446.4673233032227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3868.288993835449, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5359.234390258789, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3942.243881225586, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3693.241901397705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4050.8732604980473, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5783.154544830322, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3594.0830039978027, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3432.964973449707, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3857.1899032592773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5359.710693359375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3938.7405014038086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3692.9572677612305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4045.120162963867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5794.701290130615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4703.193759918213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6730.025863647461, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7034.223175048828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4736.618900299072, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6349.532470703125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6558.7089920043945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4776.399993896484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6765.701751708984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7063.69930267334, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4804.255352020264, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6335.873928070068, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6592.266006469727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4789.127044677734, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6776.029205322266, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7080.133476257324, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4868.7005043029785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6334.608936309814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6615.4522705078125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4789.416313171387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6781.78581237793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7088.938446044922, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4863.076515197754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6337.70658493042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6600.793609619141, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2720.5924701690674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2483.000135421753, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2557.7582454681396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2807.9740715026855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2887.930555343628, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2642.5675296783447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2729.7830390930176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 2942.9398441314697, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2714.682083129883, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2481.077461242676, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2546.5123176574707, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2770.3656101226807, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2895.2795219421387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2607.7709007263184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2708.1854248046875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2939.1771125793457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2714.30495262146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2478.200340270996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2546.5188694000244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2772.149600982666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2881.8273735046387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2604.538402557373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2704.7207927703857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2940.7270431518555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2694.829921722412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2463.8759994506836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2545.979347229004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2763.0006408691406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2867.0900535583496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2610.8263969421387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2702.8379344940186, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2941.325922012329, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3304.3449783325195, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2946.0659313201904, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3681.593132019043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3774.7881507873535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3171.1609649658203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2925.623025894165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3788.6430168151855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3931.0774993896484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3327.6171684265137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2923.1019115448, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3621.9423866271973, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3753.1246376037598, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3165.8862495422363, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 2857.4206352233887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3717.0596885681152, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3865.8922958374023, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3334.316005706787, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2924.689598083496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3620.1977729797363, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3770.849952697754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3176.141757965088, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2844.6001625061035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3720.8348655700684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3876.608295440674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3328.4422492980957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2904.8958492279053, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3614.6722984313965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3763.5817527770996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3183.7396717071533, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2835.171184539795, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3723.5833740234375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3868.010883331299, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4164.863548278809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6000.172481536865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4880.343532562256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4500.551853179932, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4057.3843002319336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5990.242042541504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4934.727382659912, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4516.934585571289, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4065.2088165283208, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5999.169750213623, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4946.841106414795, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4526.129913330078, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4067.943077087403, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6001.995334625244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4967.490997314453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4524.410858154297, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2329.4815921783447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2254.2701053619385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2658.5729789733887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2751.10577583313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2524.0403175354004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2273.901767730713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2633.316650390625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2693.74080657959, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2323.999032974243, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2201.540126800537, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2587.860336303711, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2608.096332550049, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2547.7801513671875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2203.648633956909, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2632.353458404541, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2671.669120788574, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 
8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2317.3291301727295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2192.876319885254, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2584.748487472534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2614.9190425872803, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2570.39870262146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2208.9948749542236, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2629.9943828582764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2675.008478164673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2308.3233642578125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2189.1340732574463, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2578.7911891937256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2599.204158782959, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2569.041585922241, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2208.9455795288086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2635.618886947632, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2671.4688396453857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2987.042074203491, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3056.736011505127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3008.3004760742188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2971.368474960327, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2881.4502239227295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2984.9358463287354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2952.034397125244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2931.6843223571777, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2864.007501602173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2988.917293548584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2983.052167892456, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2929.928960800171, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2858.8468837738037, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2982.063512802124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2966.4201641082764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2931.808490753174, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6786.209259033203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4189.369297027588, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6801.674156188965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3874.8891258239746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6815.809783935547, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3881.0673904418945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6844.6452713012695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3891.468029022217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2265.273609161377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2397.8164863586426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2418.663845062256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2885.739040374756, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2054.7451210021973, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2186.054229736328, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2173.60990524292, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2342.2713661193848, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2374.5987224578857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2956.7659187316895, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 2123.346529006958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2159.929599761963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2144.9292850494385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2342.829303741455, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2371.6324615478516, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2988.371047973633, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2117.5566387176514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2162.1068954467773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2130.33935546875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2333.969268798828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2377.947368621826, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2985.001745223999, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2125.984516143799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} 
-{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2162.61775970459, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3956.224308013916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2983.755865097046, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3867.3764419555664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2749.0939140319824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3878.766269683838, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2768.2540893554688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3888.159713745117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2770.9875106811523, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2484.5174503326416, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2374.1059017181396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2539.62703704834, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2870.965929031372, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2581.8467235565186, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2532.2443294525146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2655.8878326416016, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3146.6694355010986, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2662.3427200317383, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2554.0430450439453, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2682.876787185669, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3022.5736045837402, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2697.899351119995, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2644.7134399414062, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2758.72145652771, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3216.949586868286, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2692.0255851745605, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2567.3808097839355, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2686.133451461792, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3010.448799133301, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2698.8494396209717, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2647.906713485718, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2774.6555137634277, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3212.5121212005615, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2699.0243339538574, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2562.335367202759, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2680.5871772766113, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3009.105110168457, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2693.1356811523438, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 
8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2642.436475753784, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2773.9401626586914, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3224.5347118377686, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2914.9937629699707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3213.763484954834, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3195.325756072998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3100.2360248565674, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3518.490734100342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3605.5848503112793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3126.227045059204, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3347.797565460205, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3368.3764839172363, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3192.1017742156982, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3548.5290908813477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3611.4699363708496, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3145.4678440093994, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3370.285243988037, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3372.3502349853516, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3200.5257606506348, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3559.3400382995605, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3610.050106048584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3136.630268096924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3415.272846221924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3368.572940826416, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3204.909152984619, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3591.006908416748, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3615.6894493103027, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4395.183029174805, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4496.532001495361, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4660.5768394470215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5022.982711791992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4740.288944244385, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5031.897106170654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4859.23770904541, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4996.597099304199, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1837.0198440551758, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1847.7694511413574, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2085.965919494629, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2163.585786819458, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1974.6622467041016, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1893.5058879852295, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2340.477924346924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2382.181463241577, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1877.7051258087158, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1887.201747894287, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2059.0100860595703, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2104.0542602539062, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2019.3516826629639, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1909.5312023162842, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2363.4921550750732, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
2375.039358139038, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1874.095516204834, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1888.3601570129395, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2072.476644515991, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2095.943841934204, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2019.9107265472414, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1898.3524703979492, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2372.332181930542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2383.620481491089, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1872.3390483856201, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1884.1473484039307, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2087.944812774658, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2105.9988689422607, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2013.5446262359617, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1886.812505722046, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2378.599843978882, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2380.7683277130127, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2775.089111328125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2277.700490951538, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3207.937431335449, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2549.154224395752, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2907.276153564453, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2255.669937133789, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3343.5718154907227, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2544.282398223877, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2934.0635204315186, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2262.919521331787, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3355.393753051758, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2544.775676727295, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2961.8897247314453, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2277.4760246276855, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3375.8998489379883, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2550.0185775756836, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1565.0550413131714, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1646.5396690368652, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1675.9718418121338, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1606.018214225769, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1627.3336029052734, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1635.5547332763672, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1586.362247467041, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1650.9264183044434, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1584.6915197372437, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1628.764820098877, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1653.036642074585, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1560.931043624878, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1574.4308805465698, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1652.5438404083252, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1599.7606468200684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1625.2513694763184, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1654.4591999053955, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1568.3742380142212, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1571.705436706543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1653.3078384399414, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1605.5446290969849, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1622.8382301330566, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1659.5971393585205, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1579.017915725708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2166.190881729126, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2123.4655952453613, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2211.9351863861084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2147.7110385894775, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2219.3915271759033, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2153.4294605255127, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2228.9200019836426, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2167.0872020721436, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1966.635971069336, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1737.1729850769043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2008.9507389068601, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1778.6039733886719, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2019.129123687744, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1781.6024017333984, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2029.2740917205808, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1782.5032043457031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2544.0310287475586, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2666.646890640259, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2471.5545558929443, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2544.7724628448486, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2532.98752784729, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2597.9281425476074, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2434.9660682678223, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2513.2185554504395, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2705.142068862915, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2823.927993774414, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2657.149305343628, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2678.7892627716064, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2674.005756378174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2743.731346130371, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2597.96462059021, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2627.9177570343018, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2704.514560699463, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2824.0627479553223, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2649.4464015960693, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2679.812641143799, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2675.7040119171143, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2755.8220958709717, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2577.6774406433105, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2647.2836875915527, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2714.5683193206787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2838.8308811187744, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2642.5849628448486, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2716.9918537139893, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 2686.0366249084473, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2765.557279586792, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2579.8083114624023, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2658.470239639282, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3431.207695007324, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3298.8564682006836, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3651.2719917297363, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3542.6519775390625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3669.569263458252, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3562.3171615600586, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3684.4561767578125, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3579.6459007263184, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1751.9115161895752, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} 
-{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1850.3244972229004, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1764.5791816711426, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1708.96879196167, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1747.1977710723877, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1674.3115043640137, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1815.3483200073242, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1897.80366897583, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1804.3208026885986, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1775.0193786621094, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1806.7799949645996, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1726.9966316223145, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1816.215991973877, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1896.871042251587, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1812.8505611419678, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1771.8852710723877, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1807.6800060272217, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1733.6508750915527, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1821.3190460205078, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1906.015043258667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1816.7331218719482, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1778.5372734069824, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1813.5104084014893, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1739.2193603515625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2545.1257705688477, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2166.2945556640625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2622.8647994995117, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2274.1382598876953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2633.0740547180176, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2281.5617656707764, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2651.868963241577, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2294.6596717834473, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1978.2046031951904, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1526.226725578308, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1776.9603061676025, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1344.694414138794, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2063.998727798462, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1531.3457584381104, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1844.1820621490479, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 
128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1348.5841608047485, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2066.7555046081543, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1536.5121603012085, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1852.3998069763184, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1350.2444696426392, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2092.8347206115723, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1556.6660737991333, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1864.3438339233398, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1353.5115242004395, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6773.487854003906, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1424.6358346939087, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6673.906211853027, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1453.8748836517334, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6728.685111999512, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1456.8828773498535, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6761.390686035156, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1472.1300888061523, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3791.300811767578, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2728.6046409606934, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3810.3568077087402, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2764.152822494507, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3982.0172691345215, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2759.61199760437, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4016.6202926635738, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2780.1638317108154, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3986.1649322509766, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2757.867670059204, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4030.6641769409175, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2795.4072093963623, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4012.9912948608403, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2762.097930908203, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4059.8062133789067, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2789.9479961395264, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3559.799041748047, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2370.920629501343, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3588.8417625427246, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2467.440481185913, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
3609.7025299072266, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2469.7105503082275, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3722.0630645751953, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2487.5911903381348, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 14228.325958251953, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1904.384651184082, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 14099.525146484375, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1947.159194946289, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 14190.080337524414, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1940.0942420959473, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 14338.91616821289, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1979.7633647918701, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8921.394233703613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8802.946014404297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9057.368621826172, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9526.80866241455, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12715.99422454834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12744.107322692871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12912.143478393555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13213.87321472168, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8757.198257446289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8826.296920776367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9035.49201965332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9384.94197845459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12826.043853759766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13063.629684448242, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13176.929473876953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13547.579498291016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8562.346229553223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8751.338386535645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8932.509841918945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9215.843048095703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12884.165267944336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13066.070175170898, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13105.992965698242, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13587.589263916016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8442.51392364502, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8688.856315612793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 8834.5552444458, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9102.604522705078, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12871.039352416992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13059.170684814453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13189.483337402344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13590.643157958984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9061.984252929688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9576.330604553223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10495.016326904297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12295.045127868652, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12813.764152526855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12702.4361038208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13477.784576416016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15426.7919921875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8857.227821350098, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9389.389457702637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10090.14087677002, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11900.856666564941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12820.61538696289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12721.660537719727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13451.278076171875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15264.25537109375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8689.59270477295, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9208.91586303711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9898.239784240723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11677.989959716797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 
32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12820.185546875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12756.160278320312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13455.092391967773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15398.92219543457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8581.95785522461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9075.857429504395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9794.788780212402, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11617.53776550293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12823.20816040039, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12718.7788772583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13457.485580444336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15387.900390625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 12477.331886291504, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 14081.559143066406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20124.072494506836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20633.803253173828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15470.345306396484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 16005.506439208986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23119.6044921875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23613.29620361328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12175.71418762207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13572.221145629883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20156.85791015625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20669.051666259766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15368.141555786133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 16007.25715637207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23097.970123291016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23648.968658447266, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12014.785919189453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13399.975509643555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20195.578079223633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20713.53401184082, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15402.4267578125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15997.926177978516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23131.79039001465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23689.181365966797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11965.028533935547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13307.84194946289, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20258.31329345703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20784.565353393555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15465.474090576172, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15989.4140625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23174.753875732422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23728.900299072266, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7546.294174194336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7568.195343017578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7923.584327697754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8749.089050292969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8156.69376373291, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8025.545616149902, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8213.760108947754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8755.23151397705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6950.532646179199, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7112.708511352539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7482.588653564453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8287.855911254883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7990.293960571289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7578.736877441406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7790.513763427734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8375.339813232422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6740.573768615723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6941.6357421875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7350.207786560059, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, 
"num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8133.879165649414, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7889.267845153809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7540.074844360352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7835.670738220215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8361.945991516113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6580.885848999023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6851.1309814453125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7268.830757141113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8079.184684753418, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7842.289123535156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7507.826461791992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7775.049247741699, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8308.173217773438, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7817.067337036133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9218.370704650879, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11759.391288757324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11810.13786315918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8414.02816772461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8783.094100952148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11585.993309020996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12300.49674987793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7541.93473815918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8485.203742980957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10423.292846679688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11003.731269836426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8146.733779907227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8287.874145507812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11492.636642456055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11990.254974365234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7391.43123626709, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8313.518524169922, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10433.409767150879, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11014.082832336426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8043.992576599122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8194.612579345703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11498.256568908691, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12000.086059570312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7293.100166320801, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8318.657836914062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10443.184051513672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11035.109596252441, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7988.163070678711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8183.712196350097, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11509.198417663574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12019.485397338867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12139.013366699219, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15865.266647338867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10217.134170532227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13726.988296508789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11457.092399597168, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
15657.847671508789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9824.922752380371, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13647.091827392578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11340.995063781738, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15649.838333129883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9744.701538085938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13659.579238891602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11291.09058380127, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15675.315170288086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9724.14836883545, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13675.612258911133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6971.566390991211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7113.076438903809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7648.276901245117, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8820.225677490234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7229.855995178223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6963.268165588379, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7202.675132751465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8622.655181884766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6504.2303466796875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6584.717979431152, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7016.745948791504, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7520.47248840332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6859.061660766602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6674.165191650391, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7014.051818847656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8101.257133483888, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6339.448337554932, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6413.9667320251465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6943.759346008301, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7514.387397766113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6747.604789733887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6609.2072677612305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6952.904357910156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8100.79490661621, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6278.879680633545, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6360.822906494141, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6900.798492431641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 7515.477447509766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6684.498138427734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6521.629772186279, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6897.267150878906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8096.141395568849, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7281.971702575684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10587.998847961426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10339.899940490723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7742.278938293457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9166.457901000977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9231.913146972656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6824.395790100098, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8372.547225952148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8600.730628967285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7255.618019104004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8035.645332336426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8318.728866577148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6702.249336242676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8365.238456726074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8584.620590209961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7159.531440734863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8029.272994995118, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8313.734893798828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6661.586112976074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8377.605934143066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8609.520874023438, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7152.236480712891, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8027.976570129395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8317.921905517578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 19114.003982543945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13265.303421020508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17829.020767211914, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11673.614959716797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17603.336029052734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11601.312675476074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17561.44073486328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11599.622917175293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7192.3652267456055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7864.924049377441, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8102.756042480469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6971.127395629883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7029.443130493164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7102.93643951416, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6697.6287841796875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6398.336429595947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6489.144706726074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6449.186134338379, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5987.6518630981445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6105.659523010254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6545.23551940918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6390.511150360107, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6478.512191772461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6296.991806030273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5969.944667816162, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6084.979248046875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6510.368194580078, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6382.5983810424805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6471.653060913086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6211.999187469482, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5970.493412017822, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6107.236766815186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10194.469947814941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9579.85164642334, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 9062.497444152832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8250.183715820312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9135.401916503906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8306.723518371582, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9164.434585571289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8349.431533813477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6121.373729705811, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6090.390205383301, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6353.202228546143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6753.126373291016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7294.752655029297, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7045.125885009766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7218.518829345703, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7486.556282043457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6254.959506988525, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6212.639198303223, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6448.1660079956055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6837.817077636719, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7171.026039123535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7075.902557373047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7259.60994720459, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7493.586235046387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6241.973743438721, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6197.01473236084, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6431.444911956787, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6808.745765686035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7193.955230712891, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7066.846237182617, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7258.921318054199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7492.333564758301, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6230.764503479004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6199.041404724121, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6458.719806671143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6806.267929077148, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7212.549591064453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7148.083686828613, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7307.613067626953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7545.159759521484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 6729.412612915039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6685.7780838012695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7571.610336303711, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10270.978965759277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7515.844802856445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7138.77742767334, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7922.033538818359, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11108.142204284668, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6868.645668029785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6707.935218811035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7496.704216003418, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10306.874313354492, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7572.888145446777, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7205.68416595459, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7890.619659423828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11146.778182983398, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6881.10237121582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6668.924942016602, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7475.246238708496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10353.437767028809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7569.817924499512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7162.161865234375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7877.142066955566, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11191.718864440918, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6920.898704528809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6696.620178222656, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 
32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7438.426780700684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10335.148887634277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7605.741882324219, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7227.794189453125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7880.672836303711, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11194.249229431152, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9092.849006652832, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13065.132369995117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13673.971862792969, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9176.814422607422, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12326.59294128418, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12724.54574584961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 9136.300964355469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13081.178359985352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13677.448806762695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9162.568740844727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12296.844215393066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12764.417724609375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9241.964797973633, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13118.581771850586, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13715.647201538086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9291.53938293457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12299.37858581543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12792.325477600098, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9259.955673217773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} 
-{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13144.612884521484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13740.750503540039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9291.710815429688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12300.87875366211, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12773.201866149902, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5242.7069091796875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4901.962261199951, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5063.720798492432, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5439.4683265686035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5579.942569732666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5271.495342254639, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5415.397090911865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5692.954082489014, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5232.861309051514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4865.835494995117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5024.890384674072, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5351.528491973877, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5565.492839813232, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5200.496368408203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5363.077774047852, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5708.000183105469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5220.245895385742, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4853.94157409668, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4998.821907043457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5347.2536277771, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 5551.515197753906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5169.713611602783, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5344.7577476501465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5707.862224578857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5228.631858825684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4847.395648956299, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4995.970230102539, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5356.981258392334, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5588.656024932861, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5217.306385040283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5371.806106567383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5757.880477905273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6374.928455352783, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 
4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5721.066875457764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7129.097137451172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7331.862869262695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6120.778541564941, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5677.575969696045, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7334.541664123535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7632.723159790039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6367.702560424805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5563.396320343018, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6996.033744812012, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7236.088905334473, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6037.512664794922, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5594.335498809814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7228.446426391602, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7462.286186218262, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6387.316837310791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5550.875873565674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7007.556991577148, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7255.130271911621, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6111.2566566467285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5567.767181396484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7211.662521362305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7479.581184387207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6367.520980834961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5573.444633483887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 7009.959182739258, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7256.384353637695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6131.035995483398, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5559.947204589844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7206.420783996582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7470.553131103516, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8085.631370544434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11637.265968322754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9331.630821228027, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8726.62338256836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7806.331748962402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11600.500411987305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9464.11075592041, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8722.527809143066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7812.524185180664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11622.32608795166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9484.233207702637, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8741.754837036133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7826.434783935547, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11628.552474975586, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9588.116226196289, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8744.411087036133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4491.222190856934, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4425.335216522217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5109.991054534912, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5280.927200317383, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4831.8500900268555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4541.970062255859, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4993.255176544189, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5140.538215637207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4453.301086425781, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4286.255798339844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4835.577583312988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4900.55534362793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4821.284484863281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4346.50016784668, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4982.926731109619, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5070.86238861084, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 4441.084957122803, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4240.1971435546875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4846.314373016357, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4904.797763824463, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4864.926738739014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4392.61100769043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5000.529594421387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5082.287502288818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4434.372482299805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4230.29691696167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4841.675186157227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4902.29024887085, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4846.726722717285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4387.500782012939, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5005.563011169434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5075.220642089844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5815.10124206543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5965.20622253418, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5757.874736785889, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5741.0747146606445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5578.766269683838, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5770.555324554443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5509.973278045654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5658.3909034729, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5547.990741729736, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5775.943870544434, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5542.866897583008, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5665.225315093994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5526.2873458862305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5775.366554260254, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5504.351215362549, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5663.755855560303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12964.179992675781, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8015.7244873046875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12815.894813537598, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7490.6207275390625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12833.862991333008, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7495.19718170166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12879.483184814453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7530.080261230469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4275.015678405762, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4513.440628051758, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4638.223667144775, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5394.699821472168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4033.9766693115234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4261.799182891846, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4064.9882888793945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4321.758270263672, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4393.400497436523, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5475.376110076904, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4015.00093460083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4076.5660667419434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4004.2167663574214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4328.8886642456055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4389.354667663574, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5572.663154602051, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3990.904312133789, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4087.2072219848637, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3955.257110595703, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4314.093475341797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4385.994548797607, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5557.371349334717, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4011.101741790771, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 4089.041652679444, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7387.913932800293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5673.145446777344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7068.588218688965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5149.834060668945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7103.826675415039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5171.249618530273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7122.047653198242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5179.616603851318, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4721.483535766602, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4626.976776123047, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4942.101402282715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5558.856315612793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4936.938877105713, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4886.602687835693, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5091.270713806152, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5883.175048828125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5072.849521636963, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4927.640323638916, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5139.209613800049, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5769.662113189697, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5161.816825866699, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5080.3424072265625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5285.80623626709, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6066.2321853637695, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5139.783153533936, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4937.02672958374, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5139.164962768555, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5752.380294799805, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5195.106887817383, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5128.765754699707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5317.273120880127, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6043.233585357666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5137.52067565918, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5034.200325012207, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5285.714225769043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5873.941116333008, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5220.712776184082, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 5420.234508514404, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5621.2993240356445, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6132.96142578125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5686.548290252686, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6098.449459075928, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6154.724044799805, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5755.120105743408, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6574.627380371094, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6757.98210144043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6009.357624053955, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6320.3631591796875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6400.383834838867, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5999.11506652832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6616.667861938477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6775.768013000488, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6063.161392211914, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6345.622844696045, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6405.554294586182, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6012.612113952637, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6673.530235290527, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6786.916847229004, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6022.8545570373535, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6549.9711990356445, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6467.179985046387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6040.431880950928, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6826.678466796875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 
32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6802.9462814331055, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8211.717071533203, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8330.993385314941, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8784.50366973877, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9547.28042602539, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8966.997108459473, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9559.884376525879, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9305.77091217041, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9317.547492980957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3478.3670234680176, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3544.8641777038574, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3961.253433227539, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 4073.9996910095215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3718.238410949707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3601.4801597595215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4304.349746704102, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4351.166572570801, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3526.5097427368164, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3548.5327911376953, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3757.4961853027344, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3938.906593322754, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3788.4286499023438, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3591.571846008301, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4310.843372344971, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4345.727672576904, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 
4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3523.0323219299316, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3543.544807434082, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3777.9487800598145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3938.822555541992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3791.270046234131, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3602.6802825927734, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4314.800815582275, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4343.420448303223, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3523.5100746154785, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3520.556182861328, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3874.6030235290527, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3995.9201622009277, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3801.1135864257812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 
64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3535.406894683838, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4327.811374664307, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4353.367824554443, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5194.7731590271, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4263.067665100098, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5988.117733001709, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4773.443756103516, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5438.2037353515625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4118.177108764648, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6205.970993041992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4755.063171386719, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5552.203845977783, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 4125.783958435059, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6250.855541229248, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4761.8889808654785, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5633.015022277832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4221.579170227051, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6293.305606842041, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4766.275691986084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2974.7632026672363, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3186.078233718872, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3173.4646320343018, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3025.1212787628174, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3045.949754714966, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3118.4873485565186, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2880.361089706421, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2952.5241661071777, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2902.7505493164062, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2957.41117477417, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2950.740785598755, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2878.1841373443604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2844.810085296631, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2960.7401275634766, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2889.6886253356934, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2936.181116104126, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2960.477924346924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2851.6566467285156, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2838.7814331054688, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2989.2156887054443, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2934.9668979644775, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2933.474063873291, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2988.5433769226074, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2862.9895973205566, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3929.7780990600586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3835.437545776367, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4017.5388717651367, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3886.0865592956543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4079.34799194336, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3920.0889778137207, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4156.153869628906, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3978.694896697998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3539.1539573669434, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3115.3745555877686, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3549.407501220703, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3143.4239864349365, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3560.8947372436523, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3156.056480407715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3599.956169128418, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3183.416795730591, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4415.585784912109, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4497.762222290039, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4350.323390960693, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4609.195499420166, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4456.599960327148, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4383.949565887451, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4244.342555999756, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4514.317760467529, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5021.20512008667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5195.3972816467285, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4943.681774139404, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4939.017105102539, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4927.8521728515625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5021.814212799072, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4760.08638381958, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4788.812675476074, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 5046.364765167236, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5197.506885528564, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4991.850051879883, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5463.424320220947, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4956.982421875, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5017.5568199157715, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4799.3110275268555, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5310.731639862061, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5067.37154006958, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5207.324466705322, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5093.074531555176, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5885.851535797119, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4984.56579208374, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5019.412002563477, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4914.679298400879, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5748.238277435303, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5819.990863800049, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5617.7463722229, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6663.072319030762, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6541.342887878418, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6692.149505615234, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6444.217224121094, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6744.459991455078, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6478.198013305664, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3130.5244636535645, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3111.168165206909, "config": {"BLOCK_SIZE_M": 128, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3048.563995361328, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3070.095043182373, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2946.063823699951, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3023.13871383667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3300.2358436584473, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3238.816967010498, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3205.173749923706, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3213.7282943725586, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3135.974712371826, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3085.6080055236816, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3300.726737976074, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3261.236152648926, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 3342.7804565429688, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3205.916805267334, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3147.18656539917, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3214.2523288726807, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3315.5582427978516, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3296.5883255004883, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3389.9593544006348, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3216.247844696045, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3172.9556941986084, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3275.124931335449, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4271.883354187012, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3635.2961921691895, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4542.604808807373, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4024.113445281983, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4598.404808044434, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4030.421142578125, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4692.101268768311, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4060.154399871826, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3375.795021057129, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2596.9678592681885, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2999.666872024536, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2274.325580596924, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3539.1600036621094, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2617.7446269989014, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3128.3199977874756, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
2301.5169620513916, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3572.500991821289, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2636.0851192474365, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3148.9195251464844, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2308.2815837860107, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3645.4359817504883, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2671.486873626709, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3201.6777515411377, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2331.713285446167, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12129.61498260498, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2472.450065612793, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12021.38786315918, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2516.5416049957275, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 
4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12109.438362121582, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2543.803997039795, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12266.20891571045, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2573.5532760620117, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5870.174865722656, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4383.220615386963, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5940.574893951416, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4443.357467651367, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6369.368801116943, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4693.510112762451, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6454.839515686035, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4678.781585693359, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6425.933494567871, "config": {"BLOCK_SIZE_M": 256, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4726.06876373291, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6507.122688293457, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4708.105945587158, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6576.025466918945, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4785.804138183594, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6655.387382507324, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4762.782192230225, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5467.3956871032715, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3689.639949798584, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5539.336166381836, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3981.846866607666, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5637.489109039307, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 4012.6147460937495, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5972.575836181641, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4119.997615814209, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 21874.033203125, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2898.939847946167, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 21101.767578125, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2996.2156677246094, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 21193.941802978516, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3030.837278366089, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 21327.536239624023, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3117.4169731140137, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} diff --git a/tuning_0.log b/tuning_0.log deleted file mode 100644 index 9737a859d..000000000 --- a/tuning_0.log +++ /dev/null @@ -1,113327 +0,0 @@ -INFO 07-23 11:40:53 [__init__.py:235] Automatically detected platform cuda. 
-Namespace(input_len=64, output_len=1, batch_size=1, n=1, use_beam_search=False, num_iters_warmup=3, num_iters=3, profile=False, output_json='/home/zrlngl/watsonx/zrl-triton-results-and-notebooks/vllm_benchmarks_latency/-net-storage149-autofs-css22-nmg-models-cos-1bfc857-fmaas-integration-tests-models-granite-4_0-small-base-pipecleaner-hf/NVIDIA_H100_80GB_HBM3/tuning_ignore/exp_2025-07-23_1140//result_bs_1_il_64_ol_1.json', disable_detokenize=False, model='/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf', task='auto', tokenizer=None, tokenizer_mode='auto', trust_remote_code=False, dtype='auto', seed=None, hf_config_path=None, allowed_local_media_path='', revision=None, code_revision=None, rope_scaling={}, rope_theta=None, tokenizer_revision=None, max_model_len=None, quantization=None, enforce_eager=False, max_seq_len_to_capture=8192, max_logprobs=20, logprobs_mode='raw_logprobs', disable_sliding_window=False, disable_cascade_attn=False, skip_tokenizer_init=False, enable_prompt_embeds=False, served_model_name=None, disable_async_output_proc=False, config_format='auto', hf_token=None, hf_overrides={}, override_neuron_config={}, override_pooler_config=None, logits_processor_pattern=None, generation_config='auto', override_generation_config={}, enable_sleep_mode=False, model_impl='auto', override_attention_dtype=None, load_format='auto', download_dir=None, model_loader_extra_config={}, ignore_patterns=None, use_tqdm_on_load=True, pt_load_map_location='cpu', guided_decoding_backend='auto', guided_decoding_disable_fallback=False, guided_decoding_disable_any_whitespace=False, guided_decoding_disable_additional_properties=False, reasoning_parser='', distributed_executor_backend=None, pipeline_parallel_size=1, tensor_parallel_size=1, data_parallel_size=1, data_parallel_rank=None, data_parallel_size_local=None, data_parallel_address=None, data_parallel_rpc_port=None, data_parallel_backend='mp', enable_expert_parallel=False, enable_eplb=False, num_redundant_experts=0, eplb_window_size=1000, eplb_step_interval=3000, eplb_log_balancedness=False, max_parallel_loading_workers=None, ray_workers_use_nsight=False, disable_custom_all_reduce=False, worker_cls='auto', worker_extension_cls='', enable_multimodal_encoder_data_parallel=False, block_size=None, gpu_memory_utilization=0.9, swap_space=4, kv_cache_dtype='auto', num_gpu_blocks_override=None, enable_prefix_caching=False, prefix_caching_hash_algo='builtin', cpu_offload_gb=0, calculate_kv_scales=False, limit_mm_per_prompt={}, media_io_kwargs={}, mm_processor_kwargs=None, disable_mm_preprocessor_cache=False, interleave_mm_strings=False, enable_lora=None, enable_lora_bias=False, max_loras=1, max_lora_rank=16, lora_extra_vocab_size=256, lora_dtype='auto', max_cpu_loras=None, fully_sharded_loras=False, default_mm_loras=None, enable_prompt_adapter=None, max_prompt_adapters=1, max_prompt_adapter_token=0, speculative_config=None, show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, max_num_batched_tokens=None, max_num_seqs=None, max_num_partial_prefills=1, max_long_partial_prefills=1, cuda_graph_sizes=[], long_prefill_token_threshold=0, num_lookahead_slots=0, scheduler_delay_factor=0.0, preemption_mode=None, num_scheduler_steps=1, multi_step_stream_outputs=True, scheduling_policy='fcfs', enable_chunked_prefill=None, disable_chunked_mm_input=False, scheduler_cls='vllm.core.scheduler.Scheduler', disable_hybrid_kv_cache_manager=False, 
async_scheduling=False, kv_transfer_config=None, kv_events_config=None, compilation_config={"level":0,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":[],"use_inductor":true,"compile_sizes":null,"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"use_cudagraph":true,"cudagraph_num_of_warmups":0,"cudagraph_capture_sizes":null,"cudagraph_copy_inputs":false,"full_cuda_graph":false,"max_capture_size":null,"local_cache_dir":null}, additional_config={}, disable_log_stats=False) -ERROR 07-23 11:41:01 [config.py:133] Error retrieving safetensors: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf'. Use `repo_type` argument if needed., retrying 1 of 2 -ERROR 07-23 11:41:03 [config.py:131] Error retrieving safetensors: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf'. Use `repo_type` argument if needed. -INFO 07-23 11:41:03 [config.py:3483] Downcasting torch.float32 to torch.bfloat16. -INFO 07-23 11:41:03 [config.py:1602] Using max model len 132096 -WARNING 07-23 11:41:03 [arg_utils.py:1684] Detected VLLM_USE_V1=1 with Mamba. Usage should be considered experimental. Please report any issues on Github. -INFO 07-23 11:41:03 [config.py:2424] Chunked prefill is enabled with max_num_batched_tokens=16384. -INFO 07-23 11:41:03 [config.py:214] Setting max_seq_len_to_capture to 132096 to ensure that CUDA graph capture covers sequences of length up to max_model_len. -[triton-dejavu] generated 75 configurations out of ConfigSpace: BLOCK_SIZE_M: [4, 8, 16, 32, 64], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 4, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. -[triton-dejavu] restored 0 configurations for _selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default. -[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. -[triton-dejavu] restored 0 configurations for _bmm_chunk_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. -[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. 
-[triton-dejavu] restored 0 configurations for _chunk_scan_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. -[triton-dejavu] restored 1 configurations for _chunk_cumsum_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default. -[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. -[triton-dejavu] restored 0 configurations for _chunk_state_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. -[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. -[triton-dejavu] restored 0 configurations for _chunk_state_varlen_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. -[triton-dejavu] generated 168 configurations out of ConfigSpace: BLOCK_SIZE: [32, 64, 128, 256, 512, 1024, 2048, 4096], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. -[triton-dejavu] restored 0 configurations for _state_passing_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default. -INFO 07-23 11:41:05 [config.py:279] Setting attention block size to 528 tokens to ensure that attention page size is >= mamba page size. -INFO 07-23 11:41:05 [config.py:300] Padding mamba page size by 0.69% to ensure that mamba page size and attention page size are exactly equal. -WARNING 07-23 11:41:05 [__init__.py:2904] We must use the `spawn` multiprocessing start method. Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. 
See https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing for more information. Reason: CUDA is initialized -INFO 07-23 11:41:09 [__init__.py:235] Automatically detected platform cuda. -INFO 07-23 11:41:10 [core.py:553] Waiting for init message from front-end. -INFO 07-23 11:41:10 [core.py:71] Initializing a V1 LLM engine (v0.1.dev7919+g84c7525) with config: model='/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf', speculative_config=None, tokenizer='/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=132096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"max_capture_size":512,"local_cache_dir":null} -[triton-dejavu] generated 75 configurations out of ConfigSpace: BLOCK_SIZE_M: [4, 8, 16, 32, 64], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 4, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. -[triton-dejavu] restored 0 configurations for _selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default. -[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. 
-[triton-dejavu] restored 0 configurations for _bmm_chunk_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. -[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. -[triton-dejavu] restored 0 configurations for _chunk_scan_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. -[triton-dejavu] restored 1 configurations for _chunk_cumsum_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default. -[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. -[triton-dejavu] restored 0 configurations for _chunk_state_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. -[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. -[triton-dejavu] restored 0 configurations for _chunk_state_varlen_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. -[triton-dejavu] generated 168 configurations out of ConfigSpace: BLOCK_SIZE: [32, 64, 128, 256, 512, 1024, 2048, 4096], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. 
-[triton-dejavu] restored 0 configurations for _state_passing_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default.
-INFO 07-23 11:41:12 [parallel_state.py:1102] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
-INFO 07-23 11:41:12 [topk_topp_sampler.py:49] Using FlashInfer for top-p & top-k sampling.
-INFO 07-23 11:41:12 [gpu_model_runner.py:1793] Starting to load model /net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf...
-INFO 07-23 11:41:12 [gpu_model_runner.py:1826] Loading model from scratch...
-INFO 07-23 11:41:12 [cuda.py:246] Using FlashInfer backend on V1 engine.
-INFO 07-23 11:41:24 [default_loader.py:262] Loading weights took 11.59 seconds
-INFO 07-23 11:41:24 [gpu_model_runner.py:1850] Model loading took 60.0260 GiB and 11.687507 seconds
-INFO 07-23 11:41:27 [backends.py:530] Using cache directory: /home/zrlngl/.cache/vllm/torch_compile_cache/9bcd1b9f98/rank_0_0/backbone for vLLM's torch.compile
-INFO 07-23 11:41:27 [backends.py:541] Dynamo bytecode transform time: 2.60 s
-INFO 07-23 11:41:29 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 1.788 s
-INFO 07-23 11:41:29 [fused_moe.py:688] Using configuration from /home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json for MoE layer.
-INFO 07-23 11:41:29 [monitor.py:34] torch.compile takes 2.60 s in total
-INFO 07-23 11:41:30 [gpu_worker.py:245] Available KV cache memory: 8.99 GiB
-INFO 07-23 11:41:31 [kv_cache_utils.py:997] GPU KV cache size: 58,608 tokens
-INFO 07-23 11:41:31 [kv_cache_utils.py:1001] Maximum concurrency for 132,096 tokens per request: 4.29x
-INFO 07-23 11:41:51 [gpu_model_runner.py:2395] Graph capturing finished in 20 secs, took 0.93 GiB
-INFO 07-23 11:41:51 [core.py:193] init engine (profile, create kv cache, warmup model) took 26.97 seconds
-INFO 07-23 11:41:51 [config.py:214] Setting max_seq_len_to_capture to 132096 to ensure that CUDA graph capture covers sequences of length up to max_model_len.
-SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=True, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None, extra_args=None)
-Warming up...
-[triton-dejavu] ('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32') not in cache, starting to tune...
-[triton-dejavu] [2025-07-23 11:41:51] Started benchmarking of 2625 configurations... (use_bo: False, run: 0)
-[triton-dejavu] First execution including JIT compilation took 0.003792285919189453s.
-[triton-dejavu] First execution including JIT compilation took 0.0024290084838867188s.
-[triton-dejavu] First execution including JIT compilation took 0.002718687057495117s.
-[triton-dejavu] First execution including JIT compilation took 0.0024924278259277344s.
-[triton-dejavu] First execution including JIT compilation took 0.002490520477294922s. -[triton-dejavu] First execution including JIT compilation took 0.002441883087158203s. -[triton-dejavu] First execution including JIT compilation took 0.00249481201171875s. -[triton-dejavu] First execution including JIT compilation took 0.0025339126586914062s. -[triton-dejavu] First execution including JIT compilation took 0.0025014877319335938s. -[triton-dejavu] First execution including JIT compilation took 0.0037887096405029297s. -[triton-dejavu] First execution including JIT compilation took 0.0025954246520996094s. -[triton-dejavu] First execution including JIT compilation took 0.026740074157714844s. -[triton-dejavu] First execution including JIT compilation took 0.0025556087493896484s. -[triton-dejavu] First execution including JIT compilation took 0.002724885940551758s. -[triton-dejavu] First execution including JIT compilation took 0.002470731735229492s. -[triton-dejavu] First execution including JIT compilation took 0.0026400089263916016s. -[triton-dejavu] First execution including JIT compilation took 0.00249481201171875s. -[triton-dejavu] First execution including JIT compilation took 0.0030193328857421875s. -[triton-dejavu] First execution including JIT compilation took 0.0026063919067382812s. -[triton-dejavu] First execution including JIT compilation took 0.002532482147216797s. -[triton-dejavu] First execution including JIT compilation took 0.0024771690368652344s. -[triton-dejavu] First execution including JIT compilation took 0.0025453567504882812s. -[triton-dejavu] First execution including JIT compilation took 0.0025365352630615234s. -[triton-dejavu] First execution including JIT compilation took 0.002547740936279297s. -[triton-dejavu] First execution including JIT compilation took 0.0024492740631103516s. -[triton-dejavu] First execution including JIT compilation took 0.002447366714477539s. -[triton-dejavu] First execution including JIT compilation took 0.002489328384399414s. -[triton-dejavu] First execution including JIT compilation took 0.0026972293853759766s. -[triton-dejavu] First execution including JIT compilation took 0.002499103546142578s. -[triton-dejavu] First execution including JIT compilation took 0.002460479736328125s. -[triton-dejavu] First execution including JIT compilation took 0.0026140213012695312s. -[triton-dejavu] First execution including JIT compilation took 0.0025260448455810547s. -[triton-dejavu] First execution including JIT compilation took 0.002526998519897461s. -[triton-dejavu] First execution including JIT compilation took 0.002600431442260742s. -[triton-dejavu] First execution including JIT compilation took 0.002473115921020508s. -[triton-dejavu] First execution including JIT compilation took 0.0025148391723632812s. -[triton-dejavu] First execution including JIT compilation took 0.002527475357055664s. -[triton-dejavu] First execution including JIT compilation took 0.0025048255920410156s. -[triton-dejavu] First execution including JIT compilation took 0.0025298595428466797s. -[triton-dejavu] First execution including JIT compilation took 0.0027625560760498047s. -[triton-dejavu] First execution including JIT compilation took 0.0024726390838623047s. -[triton-dejavu] First execution including JIT compilation took 0.02015542984008789s. -[triton-dejavu] First execution including JIT compilation took 0.0024602413177490234s. -[triton-dejavu] First execution including JIT compilation took 0.00244903564453125s. 
-[triton-dejavu] First execution including JIT compilation took 0.0025997161865234375s. -[triton-dejavu] First execution including JIT compilation took 0.002533435821533203s. -[triton-dejavu] First execution including JIT compilation took 0.0026590824127197266s. -[triton-dejavu] First execution including JIT compilation took 0.002520322799682617s. -[triton-dejavu] First execution including JIT compilation took 0.0027184486389160156s. -[triton-dejavu] First execution including JIT compilation took 0.002468109130859375s. -[triton-dejavu] First execution including JIT compilation took 0.002460479736328125s. -[triton-dejavu] First execution including JIT compilation took 0.002536773681640625s. -[triton-dejavu] First execution including JIT compilation took 0.0024209022521972656s. -[triton-dejavu] First execution including JIT compilation took 0.0024750232696533203s. -[triton-dejavu] First execution including JIT compilation took 0.002548694610595703s. -[triton-dejavu] First execution including JIT compilation took 0.002722024917602539s. -[triton-dejavu] First execution including JIT compilation took 0.0025467872619628906s. -[triton-dejavu] First execution including JIT compilation took 0.0026938915252685547s. -[triton-dejavu] First execution including JIT compilation took 0.002567768096923828s. -[triton-dejavu] First execution including JIT compilation took 0.0025353431701660156s. -[triton-dejavu] First execution including JIT compilation took 0.36343860626220703s. -[triton-dejavu] First execution including JIT compilation took 0.2895364761352539s. -[triton-dejavu] First execution including JIT compilation took 0.28336596488952637s. -[triton-dejavu] First execution including JIT compilation took 0.24683427810668945s. -[triton-dejavu] First execution including JIT compilation took 0.23354220390319824s. -[triton-dejavu] First execution including JIT compilation took 0.2026069164276123s. -[triton-dejavu] First execution including JIT compilation took 0.361889123916626s. -[triton-dejavu] First execution including JIT compilation took 0.32724452018737793s. -[triton-dejavu] First execution including JIT compilation took 0.2289412021636963s. -[triton-dejavu] First execution including JIT compilation took 0.3778233528137207s. -[triton-dejavu] First execution including JIT compilation took 0.35303163528442383s. -[triton-dejavu] First execution including JIT compilation took 0.25978708267211914s. -[triton-dejavu] First execution including JIT compilation took 0.4000725746154785s. -[triton-dejavu] First execution including JIT compilation took 0.37045931816101074s. -[triton-dejavu] First execution including JIT compilation took 0.2568087577819824s. -[triton-dejavu] First execution including JIT compilation took 0.4205307960510254s. -[triton-dejavu] First execution including JIT compilation took 0.3958923816680908s. -[triton-dejavu] First execution including JIT compilation took 0.27231621742248535s. -[triton-dejavu] First execution including JIT compilation took 0.4481041431427002s. -[triton-dejavu] First execution including JIT compilation took 0.34272170066833496s. -[triton-dejavu] First execution including JIT compilation took 0.23339176177978516s. -[triton-dejavu] First execution including JIT compilation took 0.39439821243286133s. -[triton-dejavu] First execution including JIT compilation took 0.3556709289550781s. -[triton-dejavu] First execution including JIT compilation took 0.2575538158416748s. -[triton-dejavu] First execution including JIT compilation took 0.3856210708618164s. 
-[triton-dejavu] First execution including JIT compilation took 0.29673099517822266s. -[triton-dejavu] First execution including JIT compilation took 0.2755117416381836s. -[triton-dejavu] First execution including JIT compilation took 0.5427765846252441s. -[triton-dejavu] First execution including JIT compilation took 0.4995570182800293s. -[triton-dejavu] First execution including JIT compilation took 0.4783041477203369s. -[triton-dejavu] First execution including JIT compilation took 0.631375789642334s. -[triton-dejavu] First execution including JIT compilation took 0.5344967842102051s. -[triton-dejavu] First execution including JIT compilation took 0.4600377082824707s. -[triton-dejavu] First execution including JIT compilation took 0.4879109859466553s. -[triton-dejavu] First execution including JIT compilation took 0.47121238708496094s. -[triton-dejavu] First execution including JIT compilation took 0.41219019889831543s. -[triton-dejavu] First execution including JIT compilation took 0.6414506435394287s. -[triton-dejavu] First execution including JIT compilation took 0.5922412872314453s. -[triton-dejavu] First execution including JIT compilation took 0.5718593597412109s. -[triton-dejavu] First execution including JIT compilation took 0.7388913631439209s. -[triton-dejavu] First execution including JIT compilation took 0.6281144618988037s. -[triton-dejavu] First execution including JIT compilation took 0.5711205005645752s. -[triton-dejavu] First execution including JIT compilation took 0.9001131057739258s. -[triton-dejavu] First execution including JIT compilation took 0.681952953338623s. -[triton-dejavu] First execution including JIT compilation took 0.628960132598877s. -[triton-dejavu] First execution including JIT compilation took 0.19681072235107422s. -[triton-dejavu] First execution including JIT compilation took 0.1845228672027588s. -[triton-dejavu] First execution including JIT compilation took 0.18219757080078125s. -[triton-dejavu] First execution including JIT compilation took 0.218735933303833s. -[triton-dejavu] First execution including JIT compilation took 0.20905685424804688s. -[triton-dejavu] First execution including JIT compilation took 0.21239614486694336s. -[triton-dejavu] First execution including JIT compilation took 0.2337355613708496s. -[triton-dejavu] First execution including JIT compilation took 0.22266483306884766s. -[triton-dejavu] First execution including JIT compilation took 0.21049857139587402s. -[triton-dejavu] First execution including JIT compilation took 0.2469940185546875s. -[triton-dejavu] First execution including JIT compilation took 0.24105095863342285s. -[triton-dejavu] First execution including JIT compilation took 0.22619390487670898s. -[triton-dejavu] First execution including JIT compilation took 0.25737953186035156s. -[triton-dejavu] First execution including JIT compilation took 0.24932122230529785s. -[triton-dejavu] First execution including JIT compilation took 0.2292931079864502s. -[triton-dejavu] First execution including JIT compilation took 0.2662630081176758s. -[triton-dejavu] First execution including JIT compilation took 0.25505638122558594s. -[triton-dejavu] First execution including JIT compilation took 0.23747634887695312s. -[triton-dejavu] First execution including JIT compilation took 0.2888965606689453s. -[triton-dejavu] First execution including JIT compilation took 0.27660059928894043s. -[triton-dejavu] First execution including JIT compilation took 0.2541189193725586s. 
-[triton-dejavu] First execution including JIT compilation took 0.21480035781860352s. -[triton-dejavu] First execution including JIT compilation took 0.1914529800415039s. -[triton-dejavu] First execution including JIT compilation took 0.18795394897460938s. -[triton-dejavu] First execution including JIT compilation took 0.24895811080932617s. -[triton-dejavu] First execution including JIT compilation took 0.216827392578125s. -[triton-dejavu] First execution including JIT compilation took 0.21476054191589355s. -[triton-dejavu] First execution including JIT compilation took 0.26205873489379883s. -[triton-dejavu] First execution including JIT compilation took 0.23574447631835938s. -[triton-dejavu] First execution including JIT compilation took 0.22771525382995605s. -[triton-dejavu] First execution including JIT compilation took 0.2744171619415283s. -[triton-dejavu] First execution including JIT compilation took 0.24593615531921387s. -[triton-dejavu] First execution including JIT compilation took 0.25572872161865234s. -[triton-dejavu] First execution including JIT compilation took 0.2906627655029297s. -[triton-dejavu] First execution including JIT compilation took 0.2578747272491455s. -[triton-dejavu] First execution including JIT compilation took 0.2551157474517822s. -[triton-dejavu] First execution including JIT compilation took 0.307572603225708s. -[triton-dejavu] First execution including JIT compilation took 0.2733023166656494s. -[triton-dejavu] First execution including JIT compilation took 0.2657465934753418s. -[triton-dejavu] First execution including JIT compilation took 0.34859251976013184s. -[triton-dejavu] First execution including JIT compilation took 0.2858898639678955s. -[triton-dejavu] First execution including JIT compilation took 0.2817537784576416s. -[triton-dejavu] First execution including JIT compilation took 0.24785876274108887s. -[triton-dejavu] First execution including JIT compilation took 0.214003324508667s. -[triton-dejavu] First execution including JIT compilation took 0.20161771774291992s. -[triton-dejavu] First execution including JIT compilation took 0.30726170539855957s. -[triton-dejavu] First execution including JIT compilation took 0.2544825077056885s. -[triton-dejavu] First execution including JIT compilation took 0.22570061683654785s. -[triton-dejavu] First execution including JIT compilation took 0.3320579528808594s. -[triton-dejavu] First execution including JIT compilation took 0.2685830593109131s. -[triton-dejavu] First execution including JIT compilation took 0.23553252220153809s. -[triton-dejavu] First execution including JIT compilation took 0.34238600730895996s. -[triton-dejavu] First execution including JIT compilation took 0.2860074043273926s. -[triton-dejavu] First execution including JIT compilation took 0.24680185317993164s. -[triton-dejavu] First execution including JIT compilation took 0.3659553527832031s. -[triton-dejavu] First execution including JIT compilation took 0.2950880527496338s. -[triton-dejavu] First execution including JIT compilation took 0.2600231170654297s. -[triton-dejavu] First execution including JIT compilation took 0.38948678970336914s. -[triton-dejavu] First execution including JIT compilation took 0.3203599452972412s. -[triton-dejavu] First execution including JIT compilation took 0.2689199447631836s. -[triton-dejavu] First execution including JIT compilation took 0.42819809913635254s. -[triton-dejavu] First execution including JIT compilation took 0.3495504856109619s. 
-[triton-dejavu] First execution including JIT compilation took 0.2916533946990967s. -[triton-dejavu] First execution including JIT compilation took 0.3085203170776367s. -[triton-dejavu] First execution including JIT compilation took 0.26044535636901855s. -[triton-dejavu] First execution including JIT compilation took 0.24263620376586914s. -[triton-dejavu] First execution including JIT compilation took 0.4157595634460449s. -[triton-dejavu] First execution including JIT compilation took 0.358691930770874s. -[triton-dejavu] First execution including JIT compilation took 0.2635207176208496s. -[triton-dejavu] First execution including JIT compilation took 0.4522533416748047s. -[triton-dejavu] First execution including JIT compilation took 0.39212489128112793s. -[triton-dejavu] First execution including JIT compilation took 0.27906370162963867s. -[triton-dejavu] First execution including JIT compilation took 0.5023193359375s. -[triton-dejavu] First execution including JIT compilation took 0.4139993190765381s. -[triton-dejavu] First execution including JIT compilation took 0.300579309463501s. -[triton-dejavu] First execution including JIT compilation took 0.529712438583374s. -[triton-dejavu] First execution including JIT compilation took 0.43097352981567383s. -[triton-dejavu] First execution including JIT compilation took 0.3235909938812256s. -[triton-dejavu] First execution including JIT compilation took 0.5673091411590576s. -[triton-dejavu] First execution including JIT compilation took 0.4577775001525879s. -[triton-dejavu] First execution including JIT compilation took 0.34275031089782715s. -[triton-dejavu] First execution including JIT compilation took 0.6338176727294922s. -[triton-dejavu] First execution including JIT compilation took 0.49797868728637695s. -[triton-dejavu] First execution including JIT compilation took 0.37319207191467285s. -[triton-dejavu] First execution including JIT compilation took 0.47208404541015625s. -[triton-dejavu] First execution including JIT compilation took 0.33609509468078613s. -[triton-dejavu] First execution including JIT compilation took 0.30238795280456543s. -[triton-dejavu] First execution including JIT compilation took 0.7173871994018555s. -[triton-dejavu] First execution including JIT compilation took 0.5442206859588623s. -[triton-dejavu] First execution including JIT compilation took 0.511505126953125s. -[triton-dejavu] First execution including JIT compilation took 0.8165128231048584s. -[triton-dejavu] First execution including JIT compilation took 0.583181619644165s. -[triton-dejavu] First execution including JIT compilation took 0.5342910289764404s. -[triton-dejavu] First execution including JIT compilation took 0.9290053844451904s. -[triton-dejavu] First execution including JIT compilation took 0.6662936210632324s. -[triton-dejavu] First execution including JIT compilation took 0.5730347633361816s. -[triton-dejavu] First execution including JIT compilation took 0.9860448837280273s. -[triton-dejavu] First execution including JIT compilation took 0.7075350284576416s. -[triton-dejavu] First execution including JIT compilation took 0.593177080154419s. -[triton-dejavu] First execution including JIT compilation took 1.0206115245819092s. -[triton-dejavu] First execution including JIT compilation took 0.6365783214569092s. -[triton-dejavu] First execution including JIT compilation took 0.5057172775268555s. -[triton-dejavu] First execution including JIT compilation took 0.9307384490966797s. 
-[triton-dejavu] First execution including JIT compilation took 0.6267914772033691s. -[triton-dejavu] First execution including JIT compilation took 0.5471899509429932s. -[triton-dejavu] First execution including JIT compilation took 0.1969449520111084s. -[triton-dejavu] First execution including JIT compilation took 0.1712944507598877s. -[triton-dejavu] First execution including JIT compilation took 0.1570436954498291s. -[triton-dejavu] First execution including JIT compilation took 0.2668495178222656s. -[triton-dejavu] First execution including JIT compilation took 0.2014913558959961s. -[triton-dejavu] First execution including JIT compilation took 0.17658185958862305s. -[triton-dejavu] First execution including JIT compilation took 0.24475407600402832s. -[triton-dejavu] First execution including JIT compilation took 0.20715713500976562s. -[triton-dejavu] First execution including JIT compilation took 0.18568778038024902s. -[triton-dejavu] First execution including JIT compilation took 0.2623903751373291s. -[triton-dejavu] First execution including JIT compilation took 0.20713019371032715s. -[triton-dejavu] First execution including JIT compilation took 0.17886114120483398s. -[triton-dejavu] First execution including JIT compilation took 0.2510707378387451s. -[triton-dejavu] First execution including JIT compilation took 0.21624040603637695s. -[triton-dejavu] First execution including JIT compilation took 0.20012712478637695s. -[triton-dejavu] First execution including JIT compilation took 0.2688755989074707s. -[triton-dejavu] First execution including JIT compilation took 0.2036607265472412s. -[triton-dejavu] First execution including JIT compilation took 0.21555137634277344s. -[triton-dejavu] First execution including JIT compilation took 0.272658109664917s. -[triton-dejavu] First execution including JIT compilation took 0.2918975353240967s. -[triton-dejavu] First execution including JIT compilation took 0.22692298889160156s. -[triton-dejavu] First execution including JIT compilation took 0.20147228240966797s. -[triton-dejavu] First execution including JIT compilation took 0.17040586471557617s. -[triton-dejavu] First execution including JIT compilation took 0.1717395782470703s. -[triton-dejavu] First execution including JIT compilation took 0.2556333541870117s. -[triton-dejavu] First execution including JIT compilation took 0.19439339637756348s. -[triton-dejavu] First execution including JIT compilation took 0.18878650665283203s. -[triton-dejavu] First execution including JIT compilation took 0.28153157234191895s. -[triton-dejavu] First execution including JIT compilation took 0.22823047637939453s. -[triton-dejavu] First execution including JIT compilation took 0.20015215873718262s. -[triton-dejavu] First execution including JIT compilation took 0.2893240451812744s. -[triton-dejavu] First execution including JIT compilation took 0.2234363555908203s. -[triton-dejavu] First execution including JIT compilation took 0.20252442359924316s. -[triton-dejavu] First execution including JIT compilation took 0.29529833793640137s. -[triton-dejavu] First execution including JIT compilation took 0.25741052627563477s. -[triton-dejavu] First execution including JIT compilation took 0.22293853759765625s. -[triton-dejavu] First execution including JIT compilation took 0.32663512229919434s. -[triton-dejavu] First execution including JIT compilation took 0.257922887802124s. -[triton-dejavu] First execution including JIT compilation took 0.2501180171966553s. 
-[triton-dejavu] First execution including JIT compilation took 0.3506193161010742s. -[triton-dejavu] First execution including JIT compilation took 0.272749662399292s. -[triton-dejavu] First execution including JIT compilation took 0.243269681930542s. -[triton-dejavu] First execution including JIT compilation took 0.2345433235168457s. -[triton-dejavu] First execution including JIT compilation took 0.21488571166992188s. -[triton-dejavu] First execution including JIT compilation took 0.18851923942565918s. -[triton-dejavu] First execution including JIT compilation took 0.29990649223327637s. -[triton-dejavu] First execution including JIT compilation took 0.2599034309387207s. -[triton-dejavu] First execution including JIT compilation took 0.21689176559448242s. -[triton-dejavu] First execution including JIT compilation took 0.361560583114624s. -[triton-dejavu] First execution including JIT compilation took 0.27080440521240234s. -[triton-dejavu] First execution including JIT compilation took 0.22725224494934082s. -[triton-dejavu] First execution including JIT compilation took 0.34572768211364746s. -[triton-dejavu] First execution including JIT compilation took 0.2681708335876465s. -[triton-dejavu] First execution including JIT compilation took 0.22074484825134277s. -[triton-dejavu] First execution including JIT compilation took 0.3579220771789551s. -[triton-dejavu] First execution including JIT compilation took 0.2913625240325928s. -[triton-dejavu] First execution including JIT compilation took 0.27397990226745605s. -[triton-dejavu] First execution including JIT compilation took 0.36322855949401855s. -[triton-dejavu] First execution including JIT compilation took 0.36347508430480957s. -[triton-dejavu] First execution including JIT compilation took 0.2753303050994873s. -[triton-dejavu] First execution including JIT compilation took 0.4066603183746338s. -[triton-dejavu] First execution including JIT compilation took 0.4136660099029541s. -[triton-dejavu] First execution including JIT compilation took 0.29329895973205566s. -[triton-dejavu] First execution including JIT compilation took 0.37958860397338867s. -[triton-dejavu] First execution including JIT compilation took 0.24896860122680664s. -[triton-dejavu] First execution including JIT compilation took 0.21965575218200684s. -[triton-dejavu] First execution including JIT compilation took 0.4879426956176758s. -[triton-dejavu] First execution including JIT compilation took 0.33871960639953613s. -[triton-dejavu] First execution including JIT compilation took 0.24471020698547363s. -[triton-dejavu] First execution including JIT compilation took 0.4965670108795166s. -[triton-dejavu] First execution including JIT compilation took 0.40749454498291016s. -[triton-dejavu] First execution including JIT compilation took 0.2844102382659912s. -[triton-dejavu] First execution including JIT compilation took 0.6162877082824707s. -[triton-dejavu] First execution including JIT compilation took 0.37363123893737793s. -[triton-dejavu] First execution including JIT compilation took 0.30038881301879883s. -[triton-dejavu] First execution including JIT compilation took 0.6782312393188477s. -[triton-dejavu] First execution including JIT compilation took 0.395599365234375s. -[triton-dejavu] First execution including JIT compilation took 0.31715917587280273s. -[triton-dejavu] First execution including JIT compilation took 0.6199126243591309s. -[triton-dejavu] First execution including JIT compilation took 0.4322071075439453s. 
-[triton-dejavu] First execution including JIT compilation took 0.3455088138580322s.
-[triton-dejavu] First execution including JIT compilation took 0.8280949592590332s.
-[triton-dejavu] First execution including JIT compilation took 0.5218696594238281s.
-[triton-dejavu] First execution including JIT compilation took 0.36759161949157715s.
-[triton-dejavu] First execution including JIT compilation took 0.601407527923584s.
-[triton-dejavu] First execution including JIT compilation took 0.36752843856811523s.
-[triton-dejavu] First execution including JIT compilation took 0.273007869720459s.
-[triton-dejavu] First execution including JIT compilation took 0.8815312385559082s.
-[triton-dejavu] First execution including JIT compilation took 0.5408468246459961s.
-[triton-dejavu] First execution including JIT compilation took 0.4321472644805908s.
-[triton-dejavu] First execution including JIT compilation took 1.340597152709961s.
-[triton-dejavu] First execution including JIT compilation took 0.6468391418457031s.
-[triton-dejavu] First execution including JIT compilation took 0.4674386978149414s.
-[triton-dejavu] First execution including JIT compilation took 1.4745817184448242s.
-[triton-dejavu] First execution including JIT compilation took 0.7319414615631104s.
-[triton-dejavu] First execution including JIT compilation took 0.4820535182952881s.
-[triton-dejavu] First execution including JIT compilation took 1.6843111515045166s.
-[triton-dejavu] First execution including JIT compilation took 0.7272007465362549s.
-[triton-dejavu] First execution including JIT compilation took 0.5684032440185547s.
-[triton-dejavu] First execution including JIT compilation took 1.6687507629394531s.
-[triton-dejavu] First execution including JIT compilation took 0.7634897232055664s.
-[triton-dejavu] First execution including JIT compilation took 0.5958552360534668s.
-bench_cudagraph failed with out of resource: shared memory, Required: 308224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 308224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 308224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 308224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 308224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 308224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 0.21318602561950684s.
-[triton-dejavu] First execution including JIT compilation took 0.21699094772338867s.
-[triton-dejavu] First execution including JIT compilation took 0.18067646026611328s.
-[triton-dejavu] First execution including JIT compilation took 0.2883434295654297s.
-[triton-dejavu] First execution including JIT compilation took 0.20682692527770996s.
-[triton-dejavu] First execution including JIT compilation took 0.19046735763549805s.
-[triton-dejavu] First execution including JIT compilation took 0.2932305335998535s. -[triton-dejavu] First execution including JIT compilation took 0.21595001220703125s. -[triton-dejavu] First execution including JIT compilation took 0.18168210983276367s. -[triton-dejavu] First execution including JIT compilation took 0.32982873916625977s. -[triton-dejavu] First execution including JIT compilation took 0.22777533531188965s. -[triton-dejavu] First execution including JIT compilation took 0.2037661075592041s. -[triton-dejavu] First execution including JIT compilation took 0.31747984886169434s. -[triton-dejavu] First execution including JIT compilation took 0.28336167335510254s. -[triton-dejavu] First execution including JIT compilation took 0.19774699211120605s. -[triton-dejavu] First execution including JIT compilation took 0.3306758403778076s. -[triton-dejavu] First execution including JIT compilation took 0.2724292278289795s. -[triton-dejavu] First execution including JIT compilation took 0.22767090797424316s. -[triton-dejavu] First execution including JIT compilation took 0.3717081546783447s. -[triton-dejavu] First execution including JIT compilation took 0.2847135066986084s. -[triton-dejavu] First execution including JIT compilation took 0.2544288635253906s. -[triton-dejavu] First execution including JIT compilation took 0.2563972473144531s. -[triton-dejavu] First execution including JIT compilation took 0.21262860298156738s. -[triton-dejavu] First execution including JIT compilation took 0.2203054428100586s. -[triton-dejavu] First execution including JIT compilation took 0.3555338382720947s. -[triton-dejavu] First execution including JIT compilation took 0.25258374214172363s. -[triton-dejavu] First execution including JIT compilation took 0.22145795822143555s. -[triton-dejavu] First execution including JIT compilation took 0.39704275131225586s. -[triton-dejavu] First execution including JIT compilation took 0.26523470878601074s. -[triton-dejavu] First execution including JIT compilation took 0.21595096588134766s. -[triton-dejavu] First execution including JIT compilation took 0.4347100257873535s. -[triton-dejavu] First execution including JIT compilation took 0.29169178009033203s. -[triton-dejavu] First execution including JIT compilation took 0.21956300735473633s. -[triton-dejavu] First execution including JIT compilation took 0.4330458641052246s. -[triton-dejavu] First execution including JIT compilation took 0.31913185119628906s. -[triton-dejavu] First execution including JIT compilation took 0.2509474754333496s. -[triton-dejavu] First execution including JIT compilation took 0.48702025413513184s. -[triton-dejavu] First execution including JIT compilation took 0.32025718688964844s. -[triton-dejavu] First execution including JIT compilation took 0.2625458240509033s. -[triton-dejavu] First execution including JIT compilation took 0.551466703414917s. -[triton-dejavu] First execution including JIT compilation took 0.37408924102783203s. -[triton-dejavu] First execution including JIT compilation took 0.27136731147766113s. -[triton-dejavu] First execution including JIT compilation took 0.33800768852233887s. -[triton-dejavu] First execution including JIT compilation took 0.26560330390930176s. -[triton-dejavu] First execution including JIT compilation took 0.20183563232421875s. -[triton-dejavu] First execution including JIT compilation took 0.40157294273376465s. -[triton-dejavu] First execution including JIT compilation took 0.3323667049407959s. 
-[triton-dejavu] First execution including JIT compilation took 0.2476518154144287s. -[triton-dejavu] First execution including JIT compilation took 0.5237414836883545s. -[triton-dejavu] First execution including JIT compilation took 0.38099026679992676s. -[triton-dejavu] First execution including JIT compilation took 0.25824856758117676s. -[triton-dejavu] First execution including JIT compilation took 0.5798733234405518s. -[triton-dejavu] First execution including JIT compilation took 0.4060328006744385s. -[triton-dejavu] First execution including JIT compilation took 0.299180269241333s. -[triton-dejavu] First execution including JIT compilation took 0.5705587863922119s. -[triton-dejavu] First execution including JIT compilation took 0.43184709548950195s. -[triton-dejavu] First execution including JIT compilation took 0.29991817474365234s. -[triton-dejavu] First execution including JIT compilation took 0.5768892765045166s. -[triton-dejavu] First execution including JIT compilation took 0.5104458332061768s. -[triton-dejavu] First execution including JIT compilation took 0.36955881118774414s. -[triton-dejavu] First execution including JIT compilation took 0.6489105224609375s. -[triton-dejavu] First execution including JIT compilation took 0.5593419075012207s. -[triton-dejavu] First execution including JIT compilation took 0.3752884864807129s. -[triton-dejavu] First execution including JIT compilation took 0.6494286060333252s. -[triton-dejavu] First execution including JIT compilation took 0.35906028747558594s. -[triton-dejavu] First execution including JIT compilation took 0.2642378807067871s. -[triton-dejavu] First execution including JIT compilation took 0.7536261081695557s. -[triton-dejavu] First execution including JIT compilation took 0.4381115436553955s. -[triton-dejavu] First execution including JIT compilation took 0.3165860176086426s. -[triton-dejavu] First execution including JIT compilation took 1.265178918838501s. -[triton-dejavu] First execution including JIT compilation took 0.5233526229858398s. -[triton-dejavu] First execution including JIT compilation took 0.3574998378753662s. -[triton-dejavu] First execution including JIT compilation took 1.300689697265625s. -[triton-dejavu] First execution including JIT compilation took 0.6212594509124756s. -[triton-dejavu] First execution including JIT compilation took 0.4114842414855957s. -[triton-dejavu] First execution including JIT compilation took 1.3530914783477783s. -[triton-dejavu] First execution including JIT compilation took 0.6589765548706055s. -[triton-dejavu] First execution including JIT compilation took 0.4349792003631592s. -[triton-dejavu] First execution including JIT compilation took 1.412661075592041s. -[triton-dejavu] First execution including JIT compilation took 0.7154123783111572s. -[triton-dejavu] First execution including JIT compilation took 0.5011796951293945s. -bench_cudagraph failed with out of resource: shared memory, Required: 268800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 268800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 268800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 268800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 301568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.3227477073669434s. -[triton-dejavu] First execution including JIT compilation took 0.6530427932739258s. -[triton-dejavu] First execution including JIT compilation took 0.35028672218322754s. -[triton-dejavu] First execution including JIT compilation took 2.6119463443756104s. -[triton-dejavu] First execution including JIT compilation took 0.884284257888794s. -[triton-dejavu] First execution including JIT compilation took 0.5999755859375s. -[triton-dejavu] First execution including JIT compilation took 6.0120015144348145s. -[triton-dejavu] First execution including JIT compilation took 1.4350576400756836s. -[triton-dejavu] First execution including JIT compilation took 0.6809098720550537s. -[triton-dejavu] First execution including JIT compilation took 6.039306402206421s. -[triton-dejavu] First execution including JIT compilation took 1.520536184310913s. -[triton-dejavu] First execution including JIT compilation took 0.7370305061340332s. -bench_cudagraph failed with out of resource: shared memory, Required: 307200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-               ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 307200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 307200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
[... remainder of the removed triton-dejavu tuning log: the same OutOfResources traceback repeated for configurations requiring between 249088 and 996352 bytes of shared memory against the 232448-byte hardware limit, each followed by a matching "bench_cudagraph failed with ..." line, interleaved with per-configuration "[triton-dejavu] First execution including JIT compilation took ...s." timing lines (roughly 0.15 s to 5.7 s) ...]
-[triton-dejavu] First execution including JIT compilation took 0.16661763191223145s. -[triton-dejavu] First execution including JIT compilation took 0.17052841186523438s. -[triton-dejavu] First execution including JIT compilation took 0.2100682258605957s. -[triton-dejavu] First execution including JIT compilation took 0.20937323570251465s. -[triton-dejavu] First execution including JIT compilation took 0.19020938873291016s. -[triton-dejavu] First execution including JIT compilation took 0.20560169219970703s. -[triton-dejavu] First execution including JIT compilation took 0.19290709495544434s. -[triton-dejavu] First execution including JIT compilation took 0.19777560234069824s. -[triton-dejavu] First execution including JIT compilation took 0.21995210647583008s. -[triton-dejavu] First execution including JIT compilation took 0.21872901916503906s. -[triton-dejavu] First execution including JIT compilation took 0.2034306526184082s. -[triton-dejavu] First execution including JIT compilation took 0.24239134788513184s. -[triton-dejavu] First execution including JIT compilation took 0.26946043968200684s. -[triton-dejavu] First execution including JIT compilation took 0.2544829845428467s. -[triton-dejavu] First execution including JIT compilation took 0.2922830581665039s. -[triton-dejavu] First execution including JIT compilation took 0.27474284172058105s. -[triton-dejavu] First execution including JIT compilation took 0.2743556499481201s. -[triton-dejavu] First execution including JIT compilation took 0.31418609619140625s. -[triton-dejavu] First execution including JIT compilation took 0.30026888847351074s. -[triton-dejavu] First execution including JIT compilation took 0.28441858291625977s. -[triton-dejavu] First execution including JIT compilation took 0.23394203186035156s. -[triton-dejavu] First execution including JIT compilation took 0.20772600173950195s. -[triton-dejavu] First execution including JIT compilation took 0.18996429443359375s. -[triton-dejavu] First execution including JIT compilation took 0.27438807487487793s. -[triton-dejavu] First execution including JIT compilation took 0.23485589027404785s. -[triton-dejavu] First execution including JIT compilation took 0.2199420928955078s. -[triton-dejavu] First execution including JIT compilation took 0.29147887229919434s. -[triton-dejavu] First execution including JIT compilation took 0.2452247142791748s. -[triton-dejavu] First execution including JIT compilation took 0.2305736541748047s. -[triton-dejavu] First execution including JIT compilation took 0.30902743339538574s. -[triton-dejavu] First execution including JIT compilation took 0.2559957504272461s. -[triton-dejavu] First execution including JIT compilation took 0.2465808391571045s. -[triton-dejavu] First execution including JIT compilation took 0.3298933506011963s. -[triton-dejavu] First execution including JIT compilation took 0.27321410179138184s. -[triton-dejavu] First execution including JIT compilation took 0.26524877548217773s. -[triton-dejavu] First execution including JIT compilation took 0.34816527366638184s. -[triton-dejavu] First execution including JIT compilation took 0.28248119354248047s. -[triton-dejavu] First execution including JIT compilation took 0.267411470413208s. -[triton-dejavu] First execution including JIT compilation took 0.4036557674407959s. -[triton-dejavu] First execution including JIT compilation took 0.30405187606811523s. -[triton-dejavu] First execution including JIT compilation took 0.3068065643310547s. 
-[triton-dejavu] First execution including JIT compilation took 0.2875032424926758s. -[triton-dejavu] First execution including JIT compilation took 0.23735547065734863s. -[triton-dejavu] First execution including JIT compilation took 0.23032617568969727s. -[triton-dejavu] First execution including JIT compilation took 0.3493824005126953s. -[triton-dejavu] First execution including JIT compilation took 0.27472662925720215s. -[triton-dejavu] First execution including JIT compilation took 0.2401866912841797s. -[triton-dejavu] First execution including JIT compilation took 0.39062976837158203s. -[triton-dejavu] First execution including JIT compilation took 0.29250192642211914s. -[triton-dejavu] First execution including JIT compilation took 0.2570502758026123s. -[triton-dejavu] First execution including JIT compilation took 0.3952975273132324s. -[triton-dejavu] First execution including JIT compilation took 0.31146764755249023s. -[triton-dejavu] First execution including JIT compilation took 0.26107025146484375s. -[triton-dejavu] First execution including JIT compilation took 0.43129730224609375s. -[triton-dejavu] First execution including JIT compilation took 0.3286442756652832s. -[triton-dejavu] First execution including JIT compilation took 0.2835230827331543s. -[triton-dejavu] First execution including JIT compilation took 0.43753743171691895s. -[triton-dejavu] First execution including JIT compilation took 0.34508848190307617s. -[triton-dejavu] First execution including JIT compilation took 0.2861642837524414s. -[triton-dejavu] First execution including JIT compilation took 0.48541975021362305s. -[triton-dejavu] First execution including JIT compilation took 0.3953580856323242s. -[triton-dejavu] First execution including JIT compilation took 0.31298136711120605s. -[triton-dejavu] First execution including JIT compilation took 0.3618011474609375s. -[triton-dejavu] First execution including JIT compilation took 0.29604053497314453s. -[triton-dejavu] First execution including JIT compilation took 0.2307584285736084s. -[triton-dejavu] First execution including JIT compilation took 0.4865717887878418s. -[triton-dejavu] First execution including JIT compilation took 0.40287113189697266s. -[triton-dejavu] First execution including JIT compilation took 0.27056026458740234s. -[triton-dejavu] First execution including JIT compilation took 0.5447485446929932s. -[triton-dejavu] First execution including JIT compilation took 0.430034875869751s. -[triton-dejavu] First execution including JIT compilation took 0.30031275749206543s. -[triton-dejavu] First execution including JIT compilation took 0.6063799858093262s. -[triton-dejavu] First execution including JIT compilation took 0.4732537269592285s. -[triton-dejavu] First execution including JIT compilation took 0.3194100856781006s. -[triton-dejavu] First execution including JIT compilation took 0.6529583930969238s. -[triton-dejavu] First execution including JIT compilation took 0.4868447780609131s. -[triton-dejavu] First execution including JIT compilation took 0.35962390899658203s. -[triton-dejavu] First execution including JIT compilation took 0.6952221393585205s. -[triton-dejavu] First execution including JIT compilation took 0.5078432559967041s. -[triton-dejavu] First execution including JIT compilation took 0.3716623783111572s. -[triton-dejavu] First execution including JIT compilation took 0.7688384056091309s. -[triton-dejavu] First execution including JIT compilation took 0.5738773345947266s. 
-[triton-dejavu] First execution including JIT compilation took 0.40444135665893555s. -[triton-dejavu] First execution including JIT compilation took 0.5380966663360596s. -[triton-dejavu] First execution including JIT compilation took 0.4179868698120117s. -[triton-dejavu] First execution including JIT compilation took 0.2959005832672119s. -[triton-dejavu] First execution including JIT compilation took 0.8164780139923096s. -[triton-dejavu] First execution including JIT compilation took 0.6937565803527832s. -[triton-dejavu] First execution including JIT compilation took 0.49874210357666016s. -[triton-dejavu] First execution including JIT compilation took 1.0514369010925293s. -[triton-dejavu] First execution including JIT compilation took 0.7419230937957764s. -[triton-dejavu] First execution including JIT compilation took 0.5654633045196533s. -[triton-dejavu] First execution including JIT compilation took 1.0287201404571533s. -[triton-dejavu] First execution including JIT compilation took 0.7904648780822754s. -[triton-dejavu] First execution including JIT compilation took 0.5895709991455078s. -[triton-dejavu] First execution including JIT compilation took 1.1943697929382324s. -[triton-dejavu] First execution including JIT compilation took 0.8456614017486572s. -[triton-dejavu] First execution including JIT compilation took 0.6484768390655518s. -[triton-dejavu] First execution including JIT compilation took 1.2749860286712646s. -[triton-dejavu] First execution including JIT compilation took 0.906667947769165s. -[triton-dejavu] First execution including JIT compilation took 0.6650793552398682s. -bench_cudagraph failed with out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.2316875457763672s. -[triton-dejavu] First execution including JIT compilation took 0.17262887954711914s. -[triton-dejavu] First execution including JIT compilation took 0.16709303855895996s. -[triton-dejavu] First execution including JIT compilation took 0.2316112518310547s. -[triton-dejavu] First execution including JIT compilation took 0.19475674629211426s. -[triton-dejavu] First execution including JIT compilation took 0.2070786952972412s. 
-[triton-dejavu] First execution including JIT compilation took 0.26039981842041016s. -[triton-dejavu] First execution including JIT compilation took 0.20882773399353027s. -[triton-dejavu] First execution including JIT compilation took 0.20374321937561035s. -[triton-dejavu] First execution including JIT compilation took 0.27775073051452637s. -[triton-dejavu] First execution including JIT compilation took 0.2467350959777832s. -[triton-dejavu] First execution including JIT compilation took 0.21544861793518066s. -[triton-dejavu] First execution including JIT compilation took 0.32980918884277344s. -[triton-dejavu] First execution including JIT compilation took 0.21561956405639648s. -[triton-dejavu] First execution including JIT compilation took 0.2177584171295166s. -[triton-dejavu] First execution including JIT compilation took 0.32271885871887207s. -[triton-dejavu] First execution including JIT compilation took 0.24530315399169922s. -[triton-dejavu] First execution including JIT compilation took 0.2153329849243164s. -[triton-dejavu] First execution including JIT compilation took 0.3502006530761719s. -[triton-dejavu] First execution including JIT compilation took 0.28928208351135254s. -[triton-dejavu] First execution including JIT compilation took 0.22752094268798828s. -[triton-dejavu] First execution including JIT compilation took 0.22658753395080566s. -[triton-dejavu] First execution including JIT compilation took 0.19278335571289062s. -[triton-dejavu] First execution including JIT compilation took 0.18082761764526367s. -[triton-dejavu] First execution including JIT compilation took 0.2848987579345703s. -[triton-dejavu] First execution including JIT compilation took 0.23020219802856445s. -[triton-dejavu] First execution including JIT compilation took 0.18162894248962402s. -[triton-dejavu] First execution including JIT compilation took 0.37184619903564453s. -[triton-dejavu] First execution including JIT compilation took 0.29797792434692383s. -[triton-dejavu] First execution including JIT compilation took 0.28612470626831055s. -[triton-dejavu] First execution including JIT compilation took 0.4051649570465088s. -[triton-dejavu] First execution including JIT compilation took 0.32303476333618164s. -[triton-dejavu] First execution including JIT compilation took 0.2697916030883789s. -[triton-dejavu] First execution including JIT compilation took 0.4405784606933594s. -[triton-dejavu] First execution including JIT compilation took 0.34795689582824707s. -[triton-dejavu] First execution including JIT compilation took 0.2898232936859131s. -[triton-dejavu] First execution including JIT compilation took 0.4761343002319336s. -[triton-dejavu] First execution including JIT compilation took 0.36168718338012695s. -[triton-dejavu] First execution including JIT compilation took 0.28768467903137207s. -[triton-dejavu] First execution including JIT compilation took 0.5420176982879639s. -[triton-dejavu] First execution including JIT compilation took 0.38568615913391113s. -[triton-dejavu] First execution including JIT compilation took 0.3170638084411621s. -[triton-dejavu] First execution including JIT compilation took 0.35248708724975586s. -[triton-dejavu] First execution including JIT compilation took 0.273029088973999s. -[triton-dejavu] First execution including JIT compilation took 0.23226165771484375s. -[triton-dejavu] First execution including JIT compilation took 0.41886162757873535s. -[triton-dejavu] First execution including JIT compilation took 0.3393113613128662s. 
-[triton-dejavu] First execution including JIT compilation took 0.26583361625671387s. -[triton-dejavu] First execution including JIT compilation took 0.46086597442626953s. -[triton-dejavu] First execution including JIT compilation took 0.3681511878967285s. -[triton-dejavu] First execution including JIT compilation took 0.28913354873657227s. -[triton-dejavu] First execution including JIT compilation took 0.49338269233703613s. -[triton-dejavu] First execution including JIT compilation took 0.39551806449890137s. -[triton-dejavu] First execution including JIT compilation took 0.3077273368835449s. -[triton-dejavu] First execution including JIT compilation took 0.5245680809020996s. -[triton-dejavu] First execution including JIT compilation took 0.4535055160522461s. -[triton-dejavu] First execution including JIT compilation took 0.32816529273986816s. -[triton-dejavu] First execution including JIT compilation took 0.563164234161377s. -[triton-dejavu] First execution including JIT compilation took 0.4629950523376465s. -[triton-dejavu] First execution including JIT compilation took 0.34805798530578613s. -[triton-dejavu] First execution including JIT compilation took 0.620722770690918s. -[triton-dejavu] First execution including JIT compilation took 0.5199508666992188s. -[triton-dejavu] First execution including JIT compilation took 0.38794612884521484s. -[triton-dejavu] First execution including JIT compilation took 0.5147454738616943s. -[triton-dejavu] First execution including JIT compilation took 0.3552286624908447s. -[triton-dejavu] First execution including JIT compilation took 0.28772640228271484s. -[triton-dejavu] First execution including JIT compilation took 0.6648948192596436s. -[triton-dejavu] First execution including JIT compilation took 0.47719812393188477s. -[triton-dejavu] First execution including JIT compilation took 0.34389352798461914s. -[triton-dejavu] First execution including JIT compilation took 0.767352819442749s. -[triton-dejavu] First execution including JIT compilation took 0.5330626964569092s. -[triton-dejavu] First execution including JIT compilation took 0.37920188903808594s. -[triton-dejavu] First execution including JIT compilation took 0.7848920822143555s. -[triton-dejavu] First execution including JIT compilation took 0.47530531883239746s. -[triton-dejavu] First execution including JIT compilation took 0.33605313301086426s. -[triton-dejavu] First execution including JIT compilation took 0.7004132270812988s. -[triton-dejavu] First execution including JIT compilation took 0.4657857418060303s. -[triton-dejavu] First execution including JIT compilation took 0.3541529178619385s. -[triton-dejavu] First execution including JIT compilation took 0.7426049709320068s. -[triton-dejavu] First execution including JIT compilation took 0.538907527923584s. -[triton-dejavu] First execution including JIT compilation took 0.3655426502227783s. -[triton-dejavu] First execution including JIT compilation took 0.8675262928009033s. -[triton-dejavu] First execution including JIT compilation took 0.5515866279602051s. -[triton-dejavu] First execution including JIT compilation took 0.4889793395996094s. -[triton-dejavu] First execution including JIT compilation took 0.9458072185516357s. -[triton-dejavu] First execution including JIT compilation took 0.5496277809143066s. -[triton-dejavu] First execution including JIT compilation took 0.3970763683319092s. -[triton-dejavu] First execution including JIT compilation took 1.425358772277832s. 
-[triton-dejavu] First execution including JIT compilation took 0.8153319358825684s. -[triton-dejavu] First execution including JIT compilation took 0.6550781726837158s. -[triton-dejavu] First execution including JIT compilation took 2.0274274349212646s. -[triton-dejavu] First execution including JIT compilation took 0.9665567874908447s. -[triton-dejavu] First execution including JIT compilation took 0.7323272228240967s. -[triton-dejavu] First execution including JIT compilation took 2.0035274028778076s. -[triton-dejavu] First execution including JIT compilation took 0.794562578201294s. -[triton-dejavu] First execution including JIT compilation took 0.5924549102783203s. -[triton-dejavu] First execution including JIT compilation took 1.8792035579681396s. -[triton-dejavu] First execution including JIT compilation took 0.8279719352722168s. -[triton-dejavu] First execution including JIT compilation took 0.5921733379364014s. -bench_cudagraph failed with out of resource: shared memory, Required: 261120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 261120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 261120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 261120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 261120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 261120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 365568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 365568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 365568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 365568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 365568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 365568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.25391125679016113s. -[triton-dejavu] First execution including JIT compilation took 0.20267081260681152s. -[triton-dejavu] First execution including JIT compilation took 0.16582441329956055s. -[triton-dejavu] First execution including JIT compilation took 0.2948935031890869s. -[triton-dejavu] First execution including JIT compilation took 0.22772979736328125s. -[triton-dejavu] First execution including JIT compilation took 0.18806099891662598s. -[triton-dejavu] First execution including JIT compilation took 0.3108959197998047s. -[triton-dejavu] First execution including JIT compilation took 0.344806432723999s. -[triton-dejavu] First execution including JIT compilation took 0.20200371742248535s. -[triton-dejavu] First execution including JIT compilation took 0.3476550579071045s. -[triton-dejavu] First execution including JIT compilation took 0.2549312114715576s. -[triton-dejavu] First execution including JIT compilation took 0.22494268417358398s. -[triton-dejavu] First execution including JIT compilation took 0.37220120429992676s. -[triton-dejavu] First execution including JIT compilation took 0.2809004783630371s. -[triton-dejavu] First execution including JIT compilation took 0.22059869766235352s. -[triton-dejavu] First execution including JIT compilation took 0.388761043548584s. -[triton-dejavu] First execution including JIT compilation took 0.3077573776245117s. -[triton-dejavu] First execution including JIT compilation took 0.28969645500183105s. -[triton-dejavu] First execution including JIT compilation took 0.4982035160064697s. -[triton-dejavu] First execution including JIT compilation took 0.39230942726135254s. -[triton-dejavu] First execution including JIT compilation took 0.2644228935241699s. -[triton-dejavu] First execution including JIT compilation took 0.3611593246459961s. -[triton-dejavu] First execution including JIT compilation took 0.2406003475189209s. -[triton-dejavu] First execution including JIT compilation took 0.20929169654846191s. -[triton-dejavu] First execution including JIT compilation took 0.4092109203338623s. 
-[triton-dejavu] First execution including JIT compilation took 0.2963707447052002s. -[triton-dejavu] First execution including JIT compilation took 0.2378685474395752s. -[triton-dejavu] First execution including JIT compilation took 0.45641469955444336s. -[triton-dejavu] First execution including JIT compilation took 0.32480573654174805s. -[triton-dejavu] First execution including JIT compilation took 0.2426598072052002s. -[triton-dejavu] First execution including JIT compilation took 0.5122194290161133s. -[triton-dejavu] First execution including JIT compilation took 0.3030378818511963s. -[triton-dejavu] First execution including JIT compilation took 0.24106526374816895s. -[triton-dejavu] First execution including JIT compilation took 0.4959719181060791s. -[triton-dejavu] First execution including JIT compilation took 0.4293406009674072s. -[triton-dejavu] First execution including JIT compilation took 0.32636475563049316s. -[triton-dejavu] First execution including JIT compilation took 0.7202773094177246s. -[triton-dejavu] First execution including JIT compilation took 0.4574899673461914s. -[triton-dejavu] First execution including JIT compilation took 0.3512580394744873s. -[triton-dejavu] First execution including JIT compilation took 0.8339159488677979s. -[triton-dejavu] First execution including JIT compilation took 0.5235207080841064s. -[triton-dejavu] First execution including JIT compilation took 0.3777124881744385s. -[triton-dejavu] First execution including JIT compilation took 0.5410404205322266s. -[triton-dejavu] First execution including JIT compilation took 0.37018918991088867s. -[triton-dejavu] First execution including JIT compilation took 0.27248096466064453s. -[triton-dejavu] First execution including JIT compilation took 0.6164650917053223s. -[triton-dejavu] First execution including JIT compilation took 0.4583768844604492s. -[triton-dejavu] First execution including JIT compilation took 0.32603883743286133s. -[triton-dejavu] First execution including JIT compilation took 0.7062644958496094s. -[triton-dejavu] First execution including JIT compilation took 0.515678882598877s. -[triton-dejavu] First execution including JIT compilation took 0.36606812477111816s. -[triton-dejavu] First execution including JIT compilation took 0.7721257209777832s. -[triton-dejavu] First execution including JIT compilation took 0.5739889144897461s. -[triton-dejavu] First execution including JIT compilation took 0.4048495292663574s. -[triton-dejavu] First execution including JIT compilation took 0.8282396793365479s. -[triton-dejavu] First execution including JIT compilation took 0.5025653839111328s. -[triton-dejavu] First execution including JIT compilation took 0.33838868141174316s. -[triton-dejavu] First execution including JIT compilation took 0.7806441783905029s. -[triton-dejavu] First execution including JIT compilation took 0.6090381145477295s. -[triton-dejavu] First execution including JIT compilation took 0.3753626346588135s. -[triton-dejavu] First execution including JIT compilation took 0.7488856315612793s. -[triton-dejavu] First execution including JIT compilation took 0.7003397941589355s. -[triton-dejavu] First execution including JIT compilation took 0.41066956520080566s. -[triton-dejavu] First execution including JIT compilation took 0.7540671825408936s. -[triton-dejavu] First execution including JIT compilation took 0.4108397960662842s. -[triton-dejavu] First execution including JIT compilation took 0.28084588050842285s. 
-[triton-dejavu] First execution including JIT compilation took 0.8834891319274902s. -[triton-dejavu] First execution including JIT compilation took 0.49424219131469727s. -[triton-dejavu] First execution including JIT compilation took 0.39174604415893555s. -[triton-dejavu] First execution including JIT compilation took 1.3143653869628906s. -[triton-dejavu] First execution including JIT compilation took 0.646043062210083s. -[triton-dejavu] First execution including JIT compilation took 0.626563549041748s. -[triton-dejavu] First execution including JIT compilation took 1.4293315410614014s. -[triton-dejavu] First execution including JIT compilation took 0.6236376762390137s. -[triton-dejavu] First execution including JIT compilation took 0.48520755767822266s. -[triton-dejavu] First execution including JIT compilation took 1.6379265785217285s. -[triton-dejavu] First execution including JIT compilation took 0.7366843223571777s. -[triton-dejavu] First execution including JIT compilation took 0.4963555335998535s. -[triton-dejavu] First execution including JIT compilation took 1.574018955230713s. -[triton-dejavu] First execution including JIT compilation took 0.7276029586791992s. -bench_cudagraph failed with out of resource: shared memory, Required: 245248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 330240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 330240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.557365894317627s. -[triton-dejavu] First execution including JIT compilation took 0.6995244026184082s. -[triton-dejavu] First execution including JIT compilation took 0.42113590240478516s. -[triton-dejavu] First execution including JIT compilation took 2.4745399951934814s. -[triton-dejavu] First execution including JIT compilation took 0.9797840118408203s. -[triton-dejavu] First execution including JIT compilation took 0.6241872310638428s. -[triton-dejavu] First execution including JIT compilation took 6.13117790222168s. -[triton-dejavu] First execution including JIT compilation took 1.4725189208984375s. -[triton-dejavu] First execution including JIT compilation took 0.681943416595459s. -bench_cudagraph failed with out of resource: shared memory, Required: 254976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 254976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 254976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 424960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 263424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 296192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 291328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 376320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 441856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 526848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 592384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 451584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 602112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 752640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 0.3794093132019043s.
-[triton-dejavu] First execution including JIT compilation took 0.28264904022216797s.
-[triton-dejavu] First execution including JIT compilation took 0.22452759742736816s.
-[triton-dejavu] First execution including JIT compilation took 0.559002161026001s.
-[triton-dejavu] First execution including JIT compilation took 0.3224790096282959s.
-[triton-dejavu] First execution including JIT compilation took 0.28389406204223633s.
-[triton-dejavu] First execution including JIT compilation took 0.2780025005340576s. -[triton-dejavu] First execution including JIT compilation took 0.24424457550048828s. -[triton-dejavu] First execution including JIT compilation took 0.231339693069458s. -[triton-dejavu] First execution including JIT compilation took 0.29887890815734863s. -[triton-dejavu] First execution including JIT compilation took 0.2637321949005127s. -[triton-dejavu] First execution including JIT compilation took 0.24405384063720703s. -[triton-dejavu] First execution including JIT compilation took 0.32925963401794434s. -[triton-dejavu] First execution including JIT compilation took 0.28090524673461914s. -[triton-dejavu] First execution including JIT compilation took 0.2658822536468506s. -[triton-dejavu] First execution including JIT compilation took 0.34981393814086914s. -[triton-dejavu] First execution including JIT compilation took 0.2969369888305664s. -[triton-dejavu] First execution including JIT compilation took 0.273942232131958s. -[triton-dejavu] First execution including JIT compilation took 0.37868213653564453s. -[triton-dejavu] First execution including JIT compilation took 0.33127617835998535s. -[triton-dejavu] First execution including JIT compilation took 0.3416633605957031s. -[triton-dejavu] First execution including JIT compilation took 0.41475677490234375s. -[triton-dejavu] First execution including JIT compilation took 0.33086156845092773s. -[triton-dejavu] First execution including JIT compilation took 0.3177492618560791s. -[triton-dejavu] First execution including JIT compilation took 0.3063650131225586s. -[triton-dejavu] First execution including JIT compilation took 0.23031854629516602s. -[triton-dejavu] First execution including JIT compilation took 0.21300745010375977s. -[triton-dejavu] First execution including JIT compilation took 0.38768625259399414s. -[triton-dejavu] First execution including JIT compilation took 0.2662017345428467s. -[triton-dejavu] First execution including JIT compilation took 0.24376845359802246s. -[triton-dejavu] First execution including JIT compilation took 0.42224621772766113s. -[triton-dejavu] First execution including JIT compilation took 0.28191328048706055s. -[triton-dejavu] First execution including JIT compilation took 0.273775577545166s. -[triton-dejavu] First execution including JIT compilation took 0.4455993175506592s. -[triton-dejavu] First execution including JIT compilation took 0.3689110279083252s. -[triton-dejavu] First execution including JIT compilation took 0.26688480377197266s. -[triton-dejavu] First execution including JIT compilation took 0.4688987731933594s. -[triton-dejavu] First execution including JIT compilation took 0.31668877601623535s. -[triton-dejavu] First execution including JIT compilation took 0.2852771282196045s. -[triton-dejavu] First execution including JIT compilation took 0.5058488845825195s. -[triton-dejavu] First execution including JIT compilation took 0.33969998359680176s. -[triton-dejavu] First execution including JIT compilation took 0.3043205738067627s. -[triton-dejavu] First execution including JIT compilation took 0.5594408512115479s. -[triton-dejavu] First execution including JIT compilation took 0.38538432121276855s. -[triton-dejavu] First execution including JIT compilation took 0.40354394912719727s. -[triton-dejavu] First execution including JIT compilation took 0.4203341007232666s. -[triton-dejavu] First execution including JIT compilation took 0.2790985107421875s. 
-[triton-dejavu] First execution including JIT compilation took 0.197509765625s. -[triton-dejavu] First execution including JIT compilation took 0.5050961971282959s. -[triton-dejavu] First execution including JIT compilation took 0.2615811824798584s. -[triton-dejavu] First execution including JIT compilation took 0.23754334449768066s. -[triton-dejavu] First execution including JIT compilation took 0.5479357242584229s. -[triton-dejavu] First execution including JIT compilation took 0.29597973823547363s. -[triton-dejavu] First execution including JIT compilation took 0.22592473030090332s. -[triton-dejavu] First execution including JIT compilation took 0.5904271602630615s. -[triton-dejavu] First execution including JIT compilation took 0.3177652359008789s. -[triton-dejavu] First execution including JIT compilation took 0.23325729370117188s. -[triton-dejavu] First execution including JIT compilation took 0.6337690353393555s. -[triton-dejavu] First execution including JIT compilation took 0.3158242702484131s. -[triton-dejavu] First execution including JIT compilation took 0.26456284523010254s. -[triton-dejavu] First execution including JIT compilation took 0.6728482246398926s. -[triton-dejavu] First execution including JIT compilation took 0.3370821475982666s. -[triton-dejavu] First execution including JIT compilation took 0.27890753746032715s. -[triton-dejavu] First execution including JIT compilation took 0.7555828094482422s. -[triton-dejavu] First execution including JIT compilation took 0.47994327545166016s. -[triton-dejavu] First execution including JIT compilation took 0.3138282299041748s. -[triton-dejavu] First execution including JIT compilation took 0.6586263179779053s. -[triton-dejavu] First execution including JIT compilation took 0.2855665683746338s. -[triton-dejavu] First execution including JIT compilation took 0.21575546264648438s. -[triton-dejavu] First execution including JIT compilation took 0.8698668479919434s. -[triton-dejavu] First execution including JIT compilation took 0.326815128326416s. -[triton-dejavu] First execution including JIT compilation took 0.24704337120056152s. -[triton-dejavu] First execution including JIT compilation took 1.3291542530059814s. -[triton-dejavu] First execution including JIT compilation took 0.41158032417297363s. -[triton-dejavu] First execution including JIT compilation took 0.2945075035095215s. -[triton-dejavu] First execution including JIT compilation took 1.4427604675292969s. -[triton-dejavu] First execution including JIT compilation took 0.4566466808319092s. -[triton-dejavu] First execution including JIT compilation took 0.35230016708374023s. -[triton-dejavu] First execution including JIT compilation took 1.5283832550048828s. -[triton-dejavu] First execution including JIT compilation took 0.822779655456543s. -[triton-dejavu] First execution including JIT compilation took 0.400043249130249s. -[triton-dejavu] First execution including JIT compilation took 1.59427809715271s. -bench_cudagraph failed with out of resource: shared memory, Required: 261120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 261120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 261120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 261120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 308224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 308224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 349184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 349184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 349184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 349184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.19841480255126953s. -[triton-dejavu] First execution including JIT compilation took 0.19194793701171875s. -[triton-dejavu] First execution including JIT compilation took 0.1582164764404297s. -[triton-dejavu] First execution including JIT compilation took 0.24194097518920898s. -[triton-dejavu] First execution including JIT compilation took 0.21207785606384277s. -[triton-dejavu] First execution including JIT compilation took 0.19394159317016602s. -[triton-dejavu] First execution including JIT compilation took 0.24550700187683105s. -[triton-dejavu] First execution including JIT compilation took 0.21821212768554688s. -[triton-dejavu] First execution including JIT compilation took 0.18725252151489258s. -[triton-dejavu] First execution including JIT compilation took 0.26776623725891113s. -[triton-dejavu] First execution including JIT compilation took 0.20471405982971191s. -[triton-dejavu] First execution including JIT compilation took 0.20141196250915527s. -[triton-dejavu] First execution including JIT compilation took 0.26976442337036133s. -[triton-dejavu] First execution including JIT compilation took 0.24028730392456055s. -[triton-dejavu] First execution including JIT compilation took 0.23756718635559082s. -[triton-dejavu] First execution including JIT compilation took 0.2599141597747803s. -[triton-dejavu] First execution including JIT compilation took 0.23916363716125488s. -[triton-dejavu] First execution including JIT compilation took 0.21816468238830566s. -[triton-dejavu] First execution including JIT compilation took 0.28762292861938477s. -[triton-dejavu] First execution including JIT compilation took 0.2479848861694336s. -[triton-dejavu] First execution including JIT compilation took 0.25420284271240234s. -[triton-dejavu] First execution including JIT compilation took 0.2362511157989502s. -[triton-dejavu] First execution including JIT compilation took 0.18312764167785645s. -[triton-dejavu] First execution including JIT compilation took 0.17608380317687988s. 
-[triton-dejavu] First execution including JIT compilation took 0.2786374092102051s. -[triton-dejavu] First execution including JIT compilation took 0.21152758598327637s. -[triton-dejavu] First execution including JIT compilation took 0.20641374588012695s. -[triton-dejavu] First execution including JIT compilation took 0.30803728103637695s. -[triton-dejavu] First execution including JIT compilation took 0.23598504066467285s. -[triton-dejavu] First execution including JIT compilation took 0.2227318286895752s. -[triton-dejavu] First execution including JIT compilation took 0.3432927131652832s. -[triton-dejavu] First execution including JIT compilation took 0.22769927978515625s. -[triton-dejavu] First execution including JIT compilation took 0.20647501945495605s. -[triton-dejavu] First execution including JIT compilation took 0.3485453128814697s. -[triton-dejavu] First execution including JIT compilation took 0.2762014865875244s. -[triton-dejavu] First execution including JIT compilation took 0.21726274490356445s. -[triton-dejavu] First execution including JIT compilation took 0.32701706886291504s. -[triton-dejavu] First execution including JIT compilation took 0.24490046501159668s. -[triton-dejavu] First execution including JIT compilation took 0.2208249568939209s. -[triton-dejavu] First execution including JIT compilation took 0.36136794090270996s. -[triton-dejavu] First execution including JIT compilation took 0.3137195110321045s. -[triton-dejavu] First execution including JIT compilation took 0.26834893226623535s. -[triton-dejavu] First execution including JIT compilation took 0.32502317428588867s. -[triton-dejavu] First execution including JIT compilation took 0.21649813652038574s. -[triton-dejavu] First execution including JIT compilation took 0.18822789192199707s. -[triton-dejavu] First execution including JIT compilation took 0.34781932830810547s. -[triton-dejavu] First execution including JIT compilation took 0.25492358207702637s. -[triton-dejavu] First execution including JIT compilation took 0.21149992942810059s. -[triton-dejavu] First execution including JIT compilation took 0.41837024688720703s. -[triton-dejavu] First execution including JIT compilation took 0.2709987163543701s. -[triton-dejavu] First execution including JIT compilation took 0.22152233123779297s. -[triton-dejavu] First execution including JIT compilation took 0.46758460998535156s. -[triton-dejavu] First execution including JIT compilation took 0.2976984977722168s. -[triton-dejavu] First execution including JIT compilation took 0.2336409091949463s. -[triton-dejavu] First execution including JIT compilation took 0.42842841148376465s. -[triton-dejavu] First execution including JIT compilation took 0.30059170722961426s. -[triton-dejavu] First execution including JIT compilation took 0.25075721740722656s. -[triton-dejavu] First execution including JIT compilation took 0.4862644672393799s. -[triton-dejavu] First execution including JIT compilation took 0.32674360275268555s. -[triton-dejavu] First execution including JIT compilation took 0.3176698684692383s. -[triton-dejavu] First execution including JIT compilation took 0.6764540672302246s. -[triton-dejavu] First execution including JIT compilation took 0.4595639705657959s. -[triton-dejavu] First execution including JIT compilation took 0.3412759304046631s. -[triton-dejavu] First execution including JIT compilation took 0.5369167327880859s. -[triton-dejavu] First execution including JIT compilation took 0.3099100589752197s. 
-[triton-dejavu] First execution including JIT compilation took 0.2513244152069092s. -[triton-dejavu] First execution including JIT compilation took 0.683905839920044s. -[triton-dejavu] First execution including JIT compilation took 0.3577401638031006s. -[triton-dejavu] First execution including JIT compilation took 0.29708075523376465s. -[triton-dejavu] First execution including JIT compilation took 0.8124041557312012s. -[triton-dejavu] First execution including JIT compilation took 0.3909914493560791s. -[triton-dejavu] First execution including JIT compilation took 0.32225966453552246s. -[triton-dejavu] First execution including JIT compilation took 0.875910758972168s. -[triton-dejavu] First execution including JIT compilation took 0.4234771728515625s. -[triton-dejavu] First execution including JIT compilation took 0.3409273624420166s. -[triton-dejavu] First execution including JIT compilation took 0.9268946647644043s. -[triton-dejavu] First execution including JIT compilation took 0.46335840225219727s. -[triton-dejavu] First execution including JIT compilation took 0.3612051010131836s. -[triton-dejavu] First execution including JIT compilation took 0.9951462745666504s. -[triton-dejavu] First execution including JIT compilation took 0.4895823001861572s. -[triton-dejavu] First execution including JIT compilation took 0.38950228691101074s. -[triton-dejavu] First execution including JIT compilation took 1.1150171756744385s. -[triton-dejavu] First execution including JIT compilation took 0.5509529113769531s. -[triton-dejavu] First execution including JIT compilation took 0.4379761219024658s. -[triton-dejavu] First execution including JIT compilation took 0.9682984352111816s. -[triton-dejavu] First execution including JIT compilation took 0.37152743339538574s. -[triton-dejavu] First execution including JIT compilation took 0.25163698196411133s. -[triton-dejavu] First execution including JIT compilation took 1.097111701965332s. -[triton-dejavu] First execution including JIT compilation took 0.4002962112426758s. -[triton-dejavu] First execution including JIT compilation took 0.29827260971069336s. -[triton-dejavu] First execution including JIT compilation took 1.8650331497192383s. -[triton-dejavu] First execution including JIT compilation took 0.5061264038085938s. -[triton-dejavu] First execution including JIT compilation took 0.3558540344238281s. -[triton-dejavu] First execution including JIT compilation took 1.9241220951080322s. -[triton-dejavu] First execution including JIT compilation took 0.6082024574279785s. -[triton-dejavu] First execution including JIT compilation took 0.35095739364624023s. -[triton-dejavu] First execution including JIT compilation took 2.000699758529663s. -bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 261120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 261120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 310272, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 310272, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 310272, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 310272, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 365568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 365568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.23331809043884277s. -[triton-dejavu] First execution including JIT compilation took 0.2007770538330078s. -[triton-dejavu] First execution including JIT compilation took 0.16995501518249512s. -[triton-dejavu] First execution including JIT compilation took 0.2849550247192383s. -[triton-dejavu] First execution including JIT compilation took 0.24909639358520508s. -[triton-dejavu] First execution including JIT compilation took 0.20572757720947266s. 
-[triton-dejavu] First execution including JIT compilation took 0.2925271987915039s. -[triton-dejavu] First execution including JIT compilation took 0.24988293647766113s. -[triton-dejavu] First execution including JIT compilation took 0.19756340980529785s. -[triton-dejavu] First execution including JIT compilation took 0.308469295501709s. -[triton-dejavu] First execution including JIT compilation took 0.25925660133361816s. -[triton-dejavu] First execution including JIT compilation took 0.2018594741821289s. -[triton-dejavu] First execution including JIT compilation took 0.31905412673950195s. -[triton-dejavu] First execution including JIT compilation took 0.2588214874267578s. -[triton-dejavu] First execution including JIT compilation took 0.22939348220825195s. -[triton-dejavu] First execution including JIT compilation took 0.3315243721008301s. -[triton-dejavu] First execution including JIT compilation took 0.2577550411224365s. -[triton-dejavu] First execution including JIT compilation took 0.23265576362609863s. -[triton-dejavu] First execution including JIT compilation took 0.3648536205291748s. -[triton-dejavu] First execution including JIT compilation took 0.29656481742858887s. -[triton-dejavu] First execution including JIT compilation took 0.24346494674682617s. -[triton-dejavu] First execution including JIT compilation took 0.29159116744995117s. -[triton-dejavu] First execution including JIT compilation took 0.20661616325378418s. -[triton-dejavu] First execution including JIT compilation took 0.19298219680786133s. -[triton-dejavu] First execution including JIT compilation took 0.3263542652130127s. -[triton-dejavu] First execution including JIT compilation took 0.2544429302215576s. -[triton-dejavu] First execution including JIT compilation took 0.24164104461669922s. -[triton-dejavu] First execution including JIT compilation took 0.35983991622924805s. -[triton-dejavu] First execution including JIT compilation took 0.27173733711242676s. -[triton-dejavu] First execution including JIT compilation took 0.30269622802734375s. -[triton-dejavu] First execution including JIT compilation took 0.3681807518005371s. -[triton-dejavu] First execution including JIT compilation took 0.30908918380737305s. -[triton-dejavu] First execution including JIT compilation took 0.21474623680114746s. -[triton-dejavu] First execution including JIT compilation took 0.4122345447540283s. -[triton-dejavu] First execution including JIT compilation took 0.29869675636291504s. -[triton-dejavu] First execution including JIT compilation took 0.22951626777648926s. -[triton-dejavu] First execution including JIT compilation took 0.4384334087371826s. -[triton-dejavu] First execution including JIT compilation took 0.34481048583984375s. -[triton-dejavu] First execution including JIT compilation took 0.23748016357421875s. -[triton-dejavu] First execution including JIT compilation took 0.4472684860229492s. -[triton-dejavu] First execution including JIT compilation took 0.3110086917877197s. -[triton-dejavu] First execution including JIT compilation took 0.2900521755218506s. -[triton-dejavu] First execution including JIT compilation took 0.3711414337158203s. -[triton-dejavu] First execution including JIT compilation took 0.23607397079467773s. -[triton-dejavu] First execution including JIT compilation took 0.264019250869751s. -[triton-dejavu] First execution including JIT compilation took 0.7435603141784668s. -[triton-dejavu] First execution including JIT compilation took 0.44277524948120117s. 
-[triton-dejavu] First execution including JIT compilation took 0.21710801124572754s. -[triton-dejavu] First execution including JIT compilation took 0.4168999195098877s. -[triton-dejavu] First execution including JIT compilation took 0.3037564754486084s. -[triton-dejavu] First execution including JIT compilation took 0.23413658142089844s. -[triton-dejavu] First execution including JIT compilation took 0.5455996990203857s. -[triton-dejavu] First execution including JIT compilation took 0.38571715354919434s. -[triton-dejavu] First execution including JIT compilation took 0.31468629837036133s. -[triton-dejavu] First execution including JIT compilation took 0.9226047992706299s. -[triton-dejavu] First execution including JIT compilation took 0.5366237163543701s. -[triton-dejavu] First execution including JIT compilation took 0.33862876892089844s. -[triton-dejavu] First execution including JIT compilation took 0.7460176944732666s. -[triton-dejavu] First execution including JIT compilation took 0.5355172157287598s. -[triton-dejavu] First execution including JIT compilation took 0.3547065258026123s. -[triton-dejavu] First execution including JIT compilation took 0.7944064140319824s. -[triton-dejavu] First execution including JIT compilation took 0.5351183414459229s. -[triton-dejavu] First execution including JIT compilation took 0.38912463188171387s. -[triton-dejavu] First execution including JIT compilation took 0.6645946502685547s. -[triton-dejavu] First execution including JIT compilation took 0.361285924911499s. -[triton-dejavu] First execution including JIT compilation took 0.26433610916137695s. -[triton-dejavu] First execution including JIT compilation took 0.7722549438476562s. -[triton-dejavu] First execution including JIT compilation took 0.43912410736083984s. -[triton-dejavu] First execution including JIT compilation took 0.34244585037231445s. -[triton-dejavu] First execution including JIT compilation took 0.8954603672027588s. -[triton-dejavu] First execution including JIT compilation took 0.4297215938568115s. -[triton-dejavu] First execution including JIT compilation took 0.29612112045288086s. -[triton-dejavu] First execution including JIT compilation took 0.8536269664764404s. -[triton-dejavu] First execution including JIT compilation took 0.4335141181945801s. -[triton-dejavu] First execution including JIT compilation took 0.3198516368865967s. -[triton-dejavu] First execution including JIT compilation took 0.9171550273895264s. -[triton-dejavu] First execution including JIT compilation took 0.49994635581970215s. -[triton-dejavu] First execution including JIT compilation took 0.3632478713989258s. -[triton-dejavu] First execution including JIT compilation took 0.9547479152679443s. -[triton-dejavu] First execution including JIT compilation took 0.6412930488586426s. -[triton-dejavu] First execution including JIT compilation took 0.38673996925354004s. -bench_cudagraph failed with out of resource: shared memory, Required: 240128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 240128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 272896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 272896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 272896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 272896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.1171579360961914s. -[triton-dejavu] First execution including JIT compilation took 0.4610905647277832s. -[triton-dejavu] First execution including JIT compilation took 0.29224681854248047s. -[triton-dejavu] First execution including JIT compilation took 1.3466551303863525s. -[triton-dejavu] First execution including JIT compilation took 0.5738677978515625s. -[triton-dejavu] First execution including JIT compilation took 0.39911484718322754s. -[triton-dejavu] First execution including JIT compilation took 2.352712631225586s. -[triton-dejavu] First execution including JIT compilation took 0.7405276298522949s. -[triton-dejavu] First execution including JIT compilation took 0.3971683979034424s. -[triton-dejavu] First execution including JIT compilation took 2.751913070678711s. -bench_cudagraph failed with out of resource: shared memory, Required: 271360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
[... the deleted tuning log continues with the same "bench_cudagraph failed" / OutOfResources traceback repeated verbatim for further autotuner configurations (Required shared memory ranging from 249600 to 807936 bytes against the 232448-byte hardware limit), interleaved with further "[triton-dejavu] First execution including JIT compilation took ..." timing lines between roughly 0.21 s and 7.19 s ...]
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 292096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 333056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 333056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 333056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 333056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 2.7049872875213623s. -[triton-dejavu] First execution including JIT compilation took 1.3836045265197754s. -[triton-dejavu] First execution including JIT compilation took 0.6024000644683838s. -[triton-dejavu] First execution including JIT compilation took 3.13523006439209s. -[triton-dejavu] First execution including JIT compilation took 2.1513431072235107s. -[triton-dejavu] First execution including JIT compilation took 0.8262593746185303s. -[triton-dejavu] First execution including JIT compilation took 7.682474613189697s. -bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248832, Hardware limit: 232448. 
Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 250368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 250368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 332288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 332288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 332288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 332288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 415744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 415744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 415744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 415744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 417280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 417280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 499200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 499200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 499200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 499200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 584192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 584192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 666112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 666112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 666112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 666112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 7.4601991176605225s. -[triton-dejavu] First execution including JIT compilation took 4.111113786697388s. -[triton-dejavu] First execution including JIT compilation took 0.9548518657684326s. -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ - self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 500736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 500736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 664576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 664576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 664576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 664576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 667648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 667648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 831488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 831488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 831488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 831488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 834560, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 834560, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 998400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
[deleted autotuning log output: repeated `triton.runtime.errors.OutOfResources` tracebacks raised in triton_dejavu/testing.py during `bench_cudagraph` ("out of resource: shared memory, Required: 251904–1332224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help."), interleaved with per-config "[triton-dejavu] First execution including JIT compilation took …s" timing lines ranging from roughly 0.16 s to 6.0 s.]
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 676864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 676864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 676864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.3655416965484619s. -[triton-dejavu] First execution including JIT compilation took 0.23999977111816406s. -[triton-dejavu] First execution including JIT compilation took 0.19980168342590332s. -[triton-dejavu] First execution including JIT compilation took 0.3963167667388916s. -[triton-dejavu] First execution including JIT compilation took 0.29250240325927734s. -[triton-dejavu] First execution including JIT compilation took 0.23019075393676758s. 
-[triton-dejavu] First execution including JIT compilation took 0.3897242546081543s. -[triton-dejavu] First execution including JIT compilation took 0.26331114768981934s. -[triton-dejavu] First execution including JIT compilation took 0.25720906257629395s. -[triton-dejavu] First execution including JIT compilation took 0.5272367000579834s. -[triton-dejavu] First execution including JIT compilation took 0.3620729446411133s. -[triton-dejavu] First execution including JIT compilation took 0.30249667167663574s. -[triton-dejavu] First execution including JIT compilation took 0.5528614521026611s. -[triton-dejavu] First execution including JIT compilation took 0.38202786445617676s. -[triton-dejavu] First execution including JIT compilation took 0.32003331184387207s. -[triton-dejavu] First execution including JIT compilation took 0.5820906162261963s. -[triton-dejavu] First execution including JIT compilation took 0.3516204357147217s. -[triton-dejavu] First execution including JIT compilation took 0.2539525032043457s. -[triton-dejavu] First execution including JIT compilation took 0.5132083892822266s. -[triton-dejavu] First execution including JIT compilation took 0.3485991954803467s. -[triton-dejavu] First execution including JIT compilation took 0.26761674880981445s. -[triton-dejavu] First execution including JIT compilation took 0.3998754024505615s. -[triton-dejavu] First execution including JIT compilation took 0.270932674407959s. -[triton-dejavu] First execution including JIT compilation took 0.21268010139465332s. -[triton-dejavu] First execution including JIT compilation took 0.4931457042694092s. -[triton-dejavu] First execution including JIT compilation took 0.3084697723388672s. -[triton-dejavu] First execution including JIT compilation took 0.22578716278076172s. -[triton-dejavu] First execution including JIT compilation took 0.4800398349761963s. -[triton-dejavu] First execution including JIT compilation took 0.3248765468597412s. -[triton-dejavu] First execution including JIT compilation took 0.25438714027404785s. -[triton-dejavu] First execution including JIT compilation took 0.5268030166625977s. -[triton-dejavu] First execution including JIT compilation took 0.32793354988098145s. -[triton-dejavu] First execution including JIT compilation took 0.2654423713684082s. -[triton-dejavu] First execution including JIT compilation took 0.5680561065673828s. -[triton-dejavu] First execution including JIT compilation took 0.3322784900665283s. -[triton-dejavu] First execution including JIT compilation took 0.25258684158325195s. -[triton-dejavu] First execution including JIT compilation took 0.5792534351348877s. -[triton-dejavu] First execution including JIT compilation took 0.5247256755828857s. -[triton-dejavu] First execution including JIT compilation took 0.3439359664916992s. -[triton-dejavu] First execution including JIT compilation took 0.8489353656768799s. -[triton-dejavu] First execution including JIT compilation took 0.5044565200805664s. -[triton-dejavu] First execution including JIT compilation took 0.39157629013061523s. -[triton-dejavu] First execution including JIT compilation took 0.733513355255127s. -[triton-dejavu] First execution including JIT compilation took 0.38277316093444824s. -[triton-dejavu] First execution including JIT compilation took 0.2873697280883789s. -[triton-dejavu] First execution including JIT compilation took 0.8169002532958984s. -[triton-dejavu] First execution including JIT compilation took 0.3655128479003906s. 
-[triton-dejavu] First execution including JIT compilation took 0.26145172119140625s. -[triton-dejavu] First execution including JIT compilation took 0.8048985004425049s. -[triton-dejavu] First execution including JIT compilation took 0.40337085723876953s. -[triton-dejavu] First execution including JIT compilation took 0.2873227596282959s. -[triton-dejavu] First execution including JIT compilation took 0.7874279022216797s. -[triton-dejavu] First execution including JIT compilation took 0.4543600082397461s. -[triton-dejavu] First execution including JIT compilation took 0.30629849433898926s. -[triton-dejavu] First execution including JIT compilation took 1.1004579067230225s. -[triton-dejavu] First execution including JIT compilation took 0.595219612121582s. -[triton-dejavu] First execution including JIT compilation took 0.4115121364593506s. -[triton-dejavu] First execution including JIT compilation took 1.1447741985321045s. -[triton-dejavu] First execution including JIT compilation took 0.6449964046478271s. -[triton-dejavu] First execution including JIT compilation took 0.42902207374572754s. -[triton-dejavu] First execution including JIT compilation took 1.485217809677124s. -[triton-dejavu] First execution including JIT compilation took 0.7232568264007568s. -[triton-dejavu] First execution including JIT compilation took 0.478473424911499s. -[triton-dejavu] First execution including JIT compilation took 1.249863862991333s. -[triton-dejavu] First execution including JIT compilation took 0.5245099067687988s. -[triton-dejavu] First execution including JIT compilation took 0.3504352569580078s. -[triton-dejavu] First execution including JIT compilation took 1.5189287662506104s. -[triton-dejavu] First execution including JIT compilation took 0.5968093872070312s. -[triton-dejavu] First execution including JIT compilation took 0.4099123477935791s. -[triton-dejavu] First execution including JIT compilation took 2.0371575355529785s. -[triton-dejavu] First execution including JIT compilation took 0.5784683227539062s. -[triton-dejavu] First execution including JIT compilation took 0.3639485836029053s. -[triton-dejavu] First execution including JIT compilation took 2.0350635051727295s. -[triton-dejavu] First execution including JIT compilation took 0.7109920978546143s. -[triton-dejavu] First execution including JIT compilation took 0.4702615737915039s. -[triton-dejavu] First execution including JIT compilation took 1.9885196685791016s. -bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 253440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 253440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 302592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 302592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 302592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 302592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 354816, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 354816, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 403968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 403968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 403968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 403968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 2.343762159347534s. -[triton-dejavu] First execution including JIT compilation took 0.7591912746429443s. -[triton-dejavu] First execution including JIT compilation took 0.41748905181884766s. -[triton-dejavu] First execution including JIT compilation took 2.6733522415161133s. -[triton-dejavu] First execution including JIT compilation took 0.8401713371276855s. -[triton-dejavu] First execution including JIT compilation took 0.44391798973083496s. 
-[triton-dejavu] First execution including JIT compilation took 8.486325740814209s. -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 304128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 304128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 402432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 402432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 402432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 402432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 405504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 405504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 503808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 503808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 503808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 503808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 506880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 506880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 605184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 605184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 605184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 605184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 709632, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 709632, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 807936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 807936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 807936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 807936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.6625301837921143s. -[triton-dejavu] First execution including JIT compilation took 0.4150726795196533s. -[triton-dejavu] First execution including JIT compilation took 0.2472681999206543s. -[triton-dejavu] First execution including JIT compilation took 0.6982488632202148s. -[triton-dejavu] First execution including JIT compilation took 0.38659119606018066s. -[triton-dejavu] First execution including JIT compilation took 0.2670154571533203s. 
-[triton-dejavu] First execution including JIT compilation took 0.6746160984039307s.
[... dozens of further "[triton-dejavu] First execution including JIT compilation took ...s" log lines deleted here; the reported times range from roughly 0.27 s to 4.4 s ...]
-bench_cudagraph failed with out of resource: shared memory, Required: 234752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-               ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 234752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
[... dozens of further deleted "bench_cudagraph failed with out of resource: shared memory" failures omitted here, with Required values between 248832 and 1070080 bytes against the same 232448-byte hardware limit, each followed by a traceback identical to the one above, interleaved with more "[triton-dejavu] First execution including JIT compilation took ...s" timing lines ...]
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 398592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 4.791898488998413s. -[triton-dejavu] First execution including JIT compilation took 2.434124231338501s. -[triton-dejavu] First execution including JIT compilation took 1.172961711883545s. -[triton-dejavu] First execution including JIT compilation took 5.012480020523071s. -[triton-dejavu] First execution including JIT compilation took 2.740521192550659s. -[triton-dejavu] First execution including JIT compilation took 1.2781014442443848s. -[triton-dejavu] First execution including JIT compilation took 9.610878944396973s. -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. 
Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 299520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 397824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 397824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 397824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 397824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 499200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 499200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 597504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 597504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 597504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 597504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 698880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 698880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 797184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 797184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 797184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 797184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ - self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) -RuntimeError: Triton Error [CUDA]: out of memory - -[triton-dejavu] First execution including JIT compilation took 7.976198434829712s. -[triton-dejavu] First execution including JIT compilation took 3.3016257286071777s. 
-bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ - self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 599040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 599040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 798720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 798720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 995328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 995328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 995328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 995328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 998400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 998400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1195008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1195008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1195008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1195008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1397760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1397760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1594368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1594368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1594368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1594368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.31671142578125s. -[triton-dejavu] First execution including JIT compilation took 0.22868680953979492s. -[triton-dejavu] First execution including JIT compilation took 0.18102025985717773s. -[triton-dejavu] First execution including JIT compilation took 0.30118656158447266s. -[triton-dejavu] First execution including JIT compilation took 0.23821210861206055s. -[triton-dejavu] First execution including JIT compilation took 0.20787501335144043s. 
-[triton-dejavu] First execution including JIT compilation took 0.32696962356567383s. -[triton-dejavu] First execution including JIT compilation took 0.2887406349182129s. -[triton-dejavu] First execution including JIT compilation took 0.21834921836853027s. -[triton-dejavu] First execution including JIT compilation took 0.3684656620025635s. -[triton-dejavu] First execution including JIT compilation took 0.2897188663482666s. -[triton-dejavu] First execution including JIT compilation took 0.21413230895996094s. -[triton-dejavu] First execution including JIT compilation took 0.4073657989501953s. -[triton-dejavu] First execution including JIT compilation took 0.29150938987731934s. -[triton-dejavu] First execution including JIT compilation took 0.24649262428283691s. -[triton-dejavu] First execution including JIT compilation took 0.3873109817504883s. -[triton-dejavu] First execution including JIT compilation took 0.3143148422241211s. -[triton-dejavu] First execution including JIT compilation took 0.25432682037353516s. -[triton-dejavu] First execution including JIT compilation took 0.4593379497528076s. -[triton-dejavu] First execution including JIT compilation took 0.3326547145843506s. -[triton-dejavu] First execution including JIT compilation took 0.27237915992736816s. -[triton-dejavu] First execution including JIT compilation took 0.37383532524108887s. -[triton-dejavu] First execution including JIT compilation took 0.2576146125793457s. -[triton-dejavu] First execution including JIT compilation took 0.2171943187713623s. -[triton-dejavu] First execution including JIT compilation took 0.3749668598175049s. -[triton-dejavu] First execution including JIT compilation took 0.3074686527252197s. -[triton-dejavu] First execution including JIT compilation took 0.2472078800201416s. -[triton-dejavu] First execution including JIT compilation took 0.4607219696044922s. -[triton-dejavu] First execution including JIT compilation took 0.2899644374847412s. -[triton-dejavu] First execution including JIT compilation took 0.29875612258911133s. -[triton-dejavu] First execution including JIT compilation took 0.59027099609375s. -[triton-dejavu] First execution including JIT compilation took 0.333834171295166s. -[triton-dejavu] First execution including JIT compilation took 0.26287293434143066s. -[triton-dejavu] First execution including JIT compilation took 0.6803445816040039s. -[triton-dejavu] First execution including JIT compilation took 0.3831348419189453s. -[triton-dejavu] First execution including JIT compilation took 0.30806612968444824s. -[triton-dejavu] First execution including JIT compilation took 0.5800254344940186s. -[triton-dejavu] First execution including JIT compilation took 0.42940402030944824s. -[triton-dejavu] First execution including JIT compilation took 0.3212895393371582s. -[triton-dejavu] First execution including JIT compilation took 0.6800441741943359s. -[triton-dejavu] First execution including JIT compilation took 0.4293978214263916s. -[triton-dejavu] First execution including JIT compilation took 0.31106042861938477s. -[triton-dejavu] First execution including JIT compilation took 0.5731973648071289s. -[triton-dejavu] First execution including JIT compilation took 0.3158423900604248s. -[triton-dejavu] First execution including JIT compilation took 0.21982431411743164s. -[triton-dejavu] First execution including JIT compilation took 0.6607396602630615s. -[triton-dejavu] First execution including JIT compilation took 0.33777403831481934s. 
-[triton-dejavu] First execution including JIT compilation took 0.26319289207458496s. -[triton-dejavu] First execution including JIT compilation took 1.0609164237976074s. -[triton-dejavu] First execution including JIT compilation took 0.42955660820007324s. -[triton-dejavu] First execution including JIT compilation took 0.29787397384643555s. -[triton-dejavu] First execution including JIT compilation took 1.1868953704833984s. -[triton-dejavu] First execution including JIT compilation took 0.44803476333618164s. -[triton-dejavu] First execution including JIT compilation took 0.31003737449645996s. -[triton-dejavu] First execution including JIT compilation took 1.2543339729309082s. -[triton-dejavu] First execution including JIT compilation took 0.5254006385803223s. -[triton-dejavu] First execution including JIT compilation took 0.32144618034362793s. -[triton-dejavu] First execution including JIT compilation took 1.3773908615112305s. -[triton-dejavu] First execution including JIT compilation took 0.5259160995483398s. -[triton-dejavu] First execution including JIT compilation took 0.34156179428100586s. -bench_cudagraph failed with out of resource: shared memory, Required: 249088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 283904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 283904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 283904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 283904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.1119396686553955s. -[triton-dejavu] First execution including JIT compilation took 0.4642174243927002s. -[triton-dejavu] First execution including JIT compilation took 0.2669200897216797s. -[triton-dejavu] First execution including JIT compilation took 1.3371021747589111s. -[triton-dejavu] First execution including JIT compilation took 0.49892330169677734s. -[triton-dejavu] First execution including JIT compilation took 0.318439245223999s. 
-[triton-dejavu] First execution including JIT compilation took 4.929638385772705s. -[triton-dejavu] First execution including JIT compilation took 0.9740848541259766s. -[triton-dejavu] First execution including JIT compilation took 0.37487220764160156s. -[triton-dejavu] First execution including JIT compilation took 5.154336214065552s. -bench_cudagraph failed with out of resource: shared memory, Required: 283136, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 283136, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 283136, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 283136, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 355840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 355840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 425472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 425472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 425472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 425472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 498176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 498176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 567808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 567808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 567808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 567808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 5.1173014640808105s. -[triton-dejavu] First execution including JIT compilation took 1.0967254638671875s. -[triton-dejavu] First execution including JIT compilation took 0.48938989639282227s. -[triton-dejavu] First execution including JIT compilation took 3.199140787124634s. -bench_cudagraph failed with out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 423936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 423936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 423936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 423936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 427008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 427008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 566272, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 566272, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 566272, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 566272, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 569344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 569344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 708608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 708608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 708608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 708608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 711680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 711680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 850944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 850944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 850944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 850944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 996352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 996352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1135616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1135616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1135616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1135616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.5043489933013916s. -[triton-dejavu] First execution including JIT compilation took 0.3277888298034668s. -[triton-dejavu] First execution including JIT compilation took 0.24386119842529297s. -[triton-dejavu] First execution including JIT compilation took 0.5100774765014648s. -[triton-dejavu] First execution including JIT compilation took 0.34099698066711426s. -[triton-dejavu] First execution including JIT compilation took 0.2779710292816162s. 
-[triton-dejavu] First execution including JIT compilation took 0.5361578464508057s. -[triton-dejavu] First execution including JIT compilation took 0.3682215213775635s. -[triton-dejavu] First execution including JIT compilation took 0.300004243850708s. -[triton-dejavu] First execution including JIT compilation took 0.5743675231933594s. -[triton-dejavu] First execution including JIT compilation took 0.3822929859161377s. -[triton-dejavu] First execution including JIT compilation took 0.3132593631744385s. -[triton-dejavu] First execution including JIT compilation took 0.600147008895874s. -[triton-dejavu] First execution including JIT compilation took 0.406186580657959s. -[triton-dejavu] First execution including JIT compilation took 0.328277587890625s. -[triton-dejavu] First execution including JIT compilation took 0.6361839771270752s. -[triton-dejavu] First execution including JIT compilation took 0.4613196849822998s. -[triton-dejavu] First execution including JIT compilation took 0.34471893310546875s. -[triton-dejavu] First execution including JIT compilation took 0.7103567123413086s. -[triton-dejavu] First execution including JIT compilation took 0.46198368072509766s. -[triton-dejavu] First execution including JIT compilation took 0.37233877182006836s. -[triton-dejavu] First execution including JIT compilation took 0.5790450572967529s. -[triton-dejavu] First execution including JIT compilation took 0.33751773834228516s. -[triton-dejavu] First execution including JIT compilation took 0.26027369499206543s. -[triton-dejavu] First execution including JIT compilation took 0.6138906478881836s. -[triton-dejavu] First execution including JIT compilation took 0.4593079090118408s. -[triton-dejavu] First execution including JIT compilation took 0.29265832901000977s. -[triton-dejavu] First execution including JIT compilation took 0.6543664932250977s. -[triton-dejavu] First execution including JIT compilation took 0.40736937522888184s. -[triton-dejavu] First execution including JIT compilation took 0.2809460163116455s. -[triton-dejavu] First execution including JIT compilation took 0.7575297355651855s. -[triton-dejavu] First execution including JIT compilation took 0.33360981941223145s. -[triton-dejavu] First execution including JIT compilation took 0.31966447830200195s. -[triton-dejavu] First execution including JIT compilation took 0.8111255168914795s. -[triton-dejavu] First execution including JIT compilation took 0.46427249908447266s. -[triton-dejavu] First execution including JIT compilation took 0.34816551208496094s. -[triton-dejavu] First execution including JIT compilation took 0.9121909141540527s. -[triton-dejavu] First execution including JIT compilation took 0.49946069717407227s. -[triton-dejavu] First execution including JIT compilation took 0.3831624984741211s. -[triton-dejavu] First execution including JIT compilation took 1.057861328125s. -[triton-dejavu] First execution including JIT compilation took 0.5552551746368408s. -[triton-dejavu] First execution including JIT compilation took 0.3699655532836914s. -[triton-dejavu] First execution including JIT compilation took 0.8594727516174316s. -[triton-dejavu] First execution including JIT compilation took 0.42781662940979004s. -[triton-dejavu] First execution including JIT compilation took 0.31317567825317383s. -[triton-dejavu] First execution including JIT compilation took 0.9985849857330322s. -[triton-dejavu] First execution including JIT compilation took 0.5023458003997803s. 
-[triton-dejavu] First execution including JIT compilation took 0.37673401832580566s.
-[triton-dejavu] First execution including JIT compilation took 1.501765251159668s.
-[triton-dejavu] First execution including JIT compilation took 0.5676426887512207s.
-[triton-dejavu] First execution including JIT compilation took 0.3674488067626953s.
-[triton-dejavu] First execution including JIT compilation took 1.6308376789093018s.
-[triton-dejavu] First execution including JIT compilation took 0.6221895217895508s.
-[triton-dejavu] First execution including JIT compilation took 0.42139220237731934s.
-[triton-dejavu] First execution including JIT compilation took 1.6980812549591064s.
-[triton-dejavu] First execution including JIT compilation took 0.6677892208099365s.
-[triton-dejavu] First execution including JIT compilation took 0.4159400463104248s.
-[triton-dejavu] First execution including JIT compilation took 1.8085997104644775s.
-[triton-dejavu] First execution including JIT compilation took 0.7202484607696533s.
-[triton-dejavu] First execution including JIT compilation took 0.4506070613861084s.
-bench_cudagraph failed with out of resource: shared memory, Required: 263424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 300288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 300288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 1.7220134735107422s.
-[triton-dejavu] First execution including JIT compilation took 0.6867120265960693s.
-[triton-dejavu] First execution including JIT compilation took 0.4079298973083496s.
-[triton-dejavu] First execution including JIT compilation took 1.9503588676452637s.
-[triton-dejavu] First execution including JIT compilation took 0.7429883480072021s.
-[triton-dejavu] First execution including JIT compilation took 0.46311044692993164s.
-[triton-dejavu] First execution including JIT compilation took 5.910313367843628s.
-[triton-dejavu] First execution including JIT compilation took 1.2488391399383545s.
-[triton-dejavu] First execution including JIT compilation took 0.5487070083618164s.
-[triton-dejavu] First execution including JIT compilation took 5.962830066680908s.
-bench_cudagraph failed with out of resource: shared memory, Required: 299520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 299520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 374784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 374784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 376320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 450048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 450048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 526848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 600576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 600576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 5.886913776397705s.
-[triton-dejavu] First execution including JIT compilation took 1.1668055057525635s.
-[triton-dejavu] First execution including JIT compilation took 0.5872712135314941s.
-[triton-dejavu] First execution including JIT compilation took 4.414681434631348s.
-bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 451584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 599040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 599040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 602112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 749568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 749568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 752640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 900096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 900096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 1201152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 1201152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 0.7140195369720459s.
-[triton-dejavu] First execution including JIT compilation took 0.36057424545288086s.
-[triton-dejavu] First execution including JIT compilation took 0.25351953506469727s.
-[triton-dejavu] First execution including JIT compilation took 0.6811349391937256s.
-[triton-dejavu] First execution including JIT compilation took 0.39322566986083984s.
-[triton-dejavu] First execution including JIT compilation took 0.26434326171875s.
-[triton-dejavu] First execution including JIT compilation took 0.7010984420776367s. -[triton-dejavu] First execution including JIT compilation took 0.3896751403808594s. -[triton-dejavu] First execution including JIT compilation took 0.28528761863708496s. -[triton-dejavu] First execution including JIT compilation took 0.7317478656768799s. -[triton-dejavu] First execution including JIT compilation took 0.4223208427429199s. -[triton-dejavu] First execution including JIT compilation took 0.28917455673217773s. -[triton-dejavu] First execution including JIT compilation took 0.744981050491333s. -[triton-dejavu] First execution including JIT compilation took 0.3946702480316162s. -[triton-dejavu] First execution including JIT compilation took 0.29891490936279297s. -[triton-dejavu] First execution including JIT compilation took 0.765406608581543s. -[triton-dejavu] First execution including JIT compilation took 0.44653844833374023s. -[triton-dejavu] First execution including JIT compilation took 0.33998966217041016s. -[triton-dejavu] First execution including JIT compilation took 0.8716061115264893s. -[triton-dejavu] First execution including JIT compilation took 0.4519209861755371s. -[triton-dejavu] First execution including JIT compilation took 0.348386287689209s. -[triton-dejavu] First execution including JIT compilation took 1.0358567237854004s. -[triton-dejavu] First execution including JIT compilation took 0.4894859790802002s. -[triton-dejavu] First execution including JIT compilation took 0.3279075622558594s. -[triton-dejavu] First execution including JIT compilation took 1.148808479309082s. -[triton-dejavu] First execution including JIT compilation took 0.5393466949462891s. -[triton-dejavu] First execution including JIT compilation took 0.3747735023498535s. -[triton-dejavu] First execution including JIT compilation took 1.237614631652832s. -[triton-dejavu] First execution including JIT compilation took 0.5807638168334961s. -[triton-dejavu] First execution including JIT compilation took 0.3793628215789795s. -[triton-dejavu] First execution including JIT compilation took 1.323664903640747s. -[triton-dejavu] First execution including JIT compilation took 0.6247925758361816s. -[triton-dejavu] First execution including JIT compilation took 0.39437222480773926s. -[triton-dejavu] First execution including JIT compilation took 1.3928866386413574s. -[triton-dejavu] First execution including JIT compilation took 0.6385958194732666s. -[triton-dejavu] First execution including JIT compilation took 0.4033050537109375s. -[triton-dejavu] First execution including JIT compilation took 1.440335988998413s. -[triton-dejavu] First execution including JIT compilation took 0.5338249206542969s. -[triton-dejavu] First execution including JIT compilation took 0.33176136016845703s. -[triton-dejavu] First execution including JIT compilation took 1.2540497779846191s. -[triton-dejavu] First execution including JIT compilation took 0.5932145118713379s. -[triton-dejavu] First execution including JIT compilation took 0.35477566719055176s. -[triton-dejavu] First execution including JIT compilation took 1.5448269844055176s. -[triton-dejavu] First execution including JIT compilation took 0.6225264072418213s. -[triton-dejavu] First execution including JIT compilation took 0.36580657958984375s. -[triton-dejavu] First execution including JIT compilation took 1.6768112182617188s. -[triton-dejavu] First execution including JIT compilation took 0.6222131252288818s. 
-[triton-dejavu] First execution including JIT compilation took 0.42403435707092285s.
[... similar timing lines up to roughly 2.4 s elided ...]
-[triton-dejavu] First execution including JIT compilation took 2.040860891342163s.
-bench_cudagraph failed with out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
[... the identical "bench_cudagraph failed with out of resource: shared memory" traceback repeats for configurations requiring between 248832 and 1332224 bytes against the 232448-byte hardware limit, interleaved with further first-execution timing lines between roughly 0.38 s and 7.1 s ...]
-[triton-dejavu] First execution including JIT compilation took 1.8299148082733154s.
-[triton-dejavu] First execution including JIT compilation took 0.8459672927856445s. -[triton-dejavu] First execution including JIT compilation took 0.4378163814544678s. -[triton-dejavu] First execution including JIT compilation took 1.9301397800445557s. -[triton-dejavu] First execution including JIT compilation took 0.8685927391052246s. -[triton-dejavu] First execution including JIT compilation took 0.49677586555480957s. -[triton-dejavu] First execution including JIT compilation took 2.1058199405670166s. -[triton-dejavu] First execution including JIT compilation took 0.9192090034484863s. -[triton-dejavu] First execution including JIT compilation took 0.4858896732330322s. -[triton-dejavu] First execution including JIT compilation took 2.163212776184082s. -[triton-dejavu] First execution including JIT compilation took 0.9346041679382324s. -[triton-dejavu] First execution including JIT compilation took 0.516742467880249s. -[triton-dejavu] First execution including JIT compilation took 2.2323033809661865s. -[triton-dejavu] First execution including JIT compilation took 1.0213954448699951s. -[triton-dejavu] First execution including JIT compilation took 0.5393123626708984s. -[triton-dejavu] First execution including JIT compilation took 2.4064090251922607s. -[triton-dejavu] First execution including JIT compilation took 1.1745574474334717s. -[triton-dejavu] First execution including JIT compilation took 0.5458128452301025s. -[triton-dejavu] First execution including JIT compilation took 2.5389962196350098s. -[triton-dejavu] First execution including JIT compilation took 1.0145437717437744s. -[triton-dejavu] First execution including JIT compilation took 0.5042729377746582s. -[triton-dejavu] First execution including JIT compilation took 2.685384511947632s. -[triton-dejavu] First execution including JIT compilation took 1.0922198295593262s. -[triton-dejavu] First execution including JIT compilation took 0.534543514251709s. -[triton-dejavu] First execution including JIT compilation took 3.543609380722046s. -[triton-dejavu] First execution including JIT compilation took 1.221311330795288s. -[triton-dejavu] First execution including JIT compilation took 0.5585613250732422s. -[triton-dejavu] First execution including JIT compilation took 3.604875087738037s. -[triton-dejavu] First execution including JIT compilation took 1.3217031955718994s. -[triton-dejavu] First execution including JIT compilation took 0.6215760707855225s. -[triton-dejavu] First execution including JIT compilation took 3.679218053817749s. -bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 298752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 298752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 298752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 298752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 349440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 349440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 398592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 398592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 398592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 398592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 5.055574655532837s. -[triton-dejavu] First execution including JIT compilation took 1.6165587902069092s. -[triton-dejavu] First execution including JIT compilation took 0.7123830318450928s. -[triton-dejavu] First execution including JIT compilation took 5.444074630737305s. -[triton-dejavu] First execution including JIT compilation took 1.641197681427002s. -[triton-dejavu] First execution including JIT compilation took 0.818612813949585s. 
-[triton-dejavu] First execution including JIT compilation took 10.92102336883545s. -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 299520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 397824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 397824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 397824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 397824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 499200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 499200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 597504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 597504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 597504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 597504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 698880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 698880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 797184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 797184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 797184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 797184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ - self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) -RuntimeError: Triton Error [CUDA]: out of memory - -[triton-dejavu] First execution including JIT compilation took 3.2677865028381348s. -[triton-dejavu] First execution including JIT compilation took 1.392303705215454s. -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ - self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 599040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 599040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 798720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 798720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 995328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 995328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 995328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__
-    self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
-RuntimeError: Triton Error [CUDA]: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 6.1288557052612305s.
-[triton-dejavu] First execution including JIT compilation took 2.2384567260742188s.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1057792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1057792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1057792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1057792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1060864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1060864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1323008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1323008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1323008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1323008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1326080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1326080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1588224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1588224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1588224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1588224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1856512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1856512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2118656, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2118656, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2118656, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-[triton-dejavu] added BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 64, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None for _chunk_state_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default and key ('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')
-[2025-07-23 14:17:39] Triton autotuning for function _chunk_state_fwd_kernel finished after 9348.03s; best config selected: BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 64, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None with benchmark time 0.003924777265638113; evaluated 2625 configurations;
-[triton-dejavu] ('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32') not in cache, starting to tune...
-[triton-dejavu] [2025-07-23 14:17:39] Started benchmarking of 168 configurations... (use_bo: False, run: 0)
-[per-configuration "First execution including JIT compilation took ..." timing lines for the 168 configurations, ranging from roughly 0.003s to 0.23s]
-[triton-dejavu] added BLOCK_SIZE: 512, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None for _state_passing_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default and key ('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32') -[2025-07-23 14:22:15] Triton autotuning for function _state_passing_fwd_kernel finished after 275.26s; best config selected: BLOCK_SIZE: 512, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None with benchmark time 0.0030820679385215044; evaluated 168 configurations; -[triton-dejavu] ('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32') not in cache, starting to tune... -[triton-dejavu] [2025-07-23 14:22:15] Started benchmarking of 2625 configurations... (use_bo: False, run: 0) -[triton-dejavu] First execution including JIT compilation took 0.1971287727355957s. -[triton-dejavu] First execution including JIT compilation took 0.18145108222961426s. -[triton-dejavu] First execution including JIT compilation took 0.18181228637695312s. -[triton-dejavu] First execution including JIT compilation took 0.20481252670288086s. -[triton-dejavu] First execution including JIT compilation took 0.19466614723205566s. -[triton-dejavu] First execution including JIT compilation took 0.17426085472106934s. -[triton-dejavu] First execution including JIT compilation took 0.21188688278198242s. -[triton-dejavu] First execution including JIT compilation took 0.20443081855773926s. -[triton-dejavu] First execution including JIT compilation took 0.18296051025390625s. -[triton-dejavu] First execution including JIT compilation took 0.21415448188781738s. -[triton-dejavu] First execution including JIT compilation took 0.20465874671936035s. -[triton-dejavu] First execution including JIT compilation took 0.1801447868347168s. -[triton-dejavu] First execution including JIT compilation took 0.21986842155456543s. -[triton-dejavu] First execution including JIT compilation took 0.2162468433380127s. -[triton-dejavu] First execution including JIT compilation took 0.17408537864685059s. -[triton-dejavu] First execution including JIT compilation took 0.23129940032958984s. -[triton-dejavu] First execution including JIT compilation took 0.22765421867370605s. -[triton-dejavu] First execution including JIT compilation took 0.171464204788208s. -[triton-dejavu] First execution including JIT compilation took 0.24284863471984863s. -[triton-dejavu] First execution including JIT compilation took 0.2351231575012207s. -[triton-dejavu] First execution including JIT compilation took 0.17095470428466797s. -[triton-dejavu] First execution including JIT compilation took 0.19266152381896973s. -[triton-dejavu] First execution including JIT compilation took 0.19104242324829102s. -[triton-dejavu] First execution including JIT compilation took 0.1844947338104248s. -[triton-dejavu] First execution including JIT compilation took 0.20961451530456543s. -[triton-dejavu] First execution including JIT compilation took 0.20723509788513184s. 
-bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
[... two further identical OutOfResources failures (Required: 286720) and the subsequent "First execution including JIT compilation took ...s." timing lines elided ...]
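The OutOfResources failures above are non-fatal by design: the launch raises during Triton's _init_handles(), the harness logs it, and tuning continues with the next candidate. A minimal sketch of that guard, assuming nothing about the actual triton-dejavu helpers (launch_kernel and time_kernel are hypothetical stand-ins):

    import math
    from triton.runtime.errors import OutOfResources  # exception type seen in the traceback above

    def bench_candidate(launch_kernel, time_kernel):
        """Benchmark one autotune candidate; skip it cleanly if it cannot be launched."""
        try:
            launch_kernel()  # JIT compilation and handle init happen here and may raise
        except OutOfResources as err:
            # e.g. "out of resource: shared memory, Required: 286720, Hardware limit: 232448"
            print(f"bench failed with {err}")
            return math.inf  # report an unusable time so the tuner never selects this config
        return time_kernel()  # otherwise return the measured time for this candidate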
-bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
[... each of these failures is reported three times with the same OutOfResources traceback as above; the interleaved "First execution including JIT compilation took ...s." timing lines are elided ...]
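The byte counts in these messages can be sanity-checked directly. The reported hardware limit of 232448 bytes is exactly 227 KiB, which matches the per-thread-block shared-memory opt-in limit of H100-class GPUs (an assumption about the machine behind this log), while the rejected configurations ask for roughly 240-500 KiB. A rough, illustrative calculation of why large tiles with several pipeline stages overflow:

    # Illustrative arithmetic only; the tile shape is a stand-in, not the kernel's real one.
    hardware_limit = 232448                 # bytes, as reported above (= 227 * 1024)
    bytes_per_elem = 2                      # bf16
    tile = 256 * 256 * bytes_per_elem       # one 256x256 bf16 tile = 131072 bytes
    for num_stages in (1, 2, 3, 4):
        required = tile * num_stages        # software pipelining keeps num_stages tiles resident
        print(num_stages, required, "fits" if required <= hardware_limit else "too large")
    # Already at num_stages=2 the tile alone needs 262144 bytes > 232448, which is why the
    # log's advice is to reduce block sizes or num_stages.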
-[triton-dejavu] First execution including JIT compilation took 0.3024318218231201s.
[… further "-[triton-dejavu] First execution including JIT compilation took …s." timing lines (roughly 0.3–5.4 s each) elided …]
-bench_cudagraph failed with out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
[… the identical "bench_cudagraph failed …" message and traceback repeat many times, with the required shared memory growing to 278528, 348160, 417792, 487424, 557056, 696320, and 974848 bytes against the same 232448-byte hardware limit, interleaved with further JIT-compilation timing lines …]
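Note on the failures above: each OutOfResources traceback comes from an autotuner candidate configuration whose shared-memory footprint (driven by the tile/block sizes and num_stages) exceeds the GPU's 232448-byte per-SM limit; the benchmark harness logs the failure and continues with the next candidate, as the later timing lines show. Below is a minimal, hypothetical sketch of that skip-on-failure pattern, not triton-dejavu's actual implementation; helper names such as kernel_call and candidate_configs are assumptions introduced only for illustration.

    import triton
    from triton.runtime.errors import OutOfResources

    def bench_surviving_configs(kernel_call, configs):
        # kernel_call(cfg) is a hypothetical callable that launches the Triton
        # kernel once with the candidate config; real timing is omitted here.
        usable = []
        for cfg in configs:
            try:
                kernel_call(cfg)  # first call triggers JIT compilation
                usable.append(cfg)
            except OutOfResources as exc:
                # The exception message carries the "Required: ..., Hardware
                # limit: ..." numbers seen in the log; such configs are skipped.
                print(f"skipping {cfg}: {exc}")
        return usable

    # Shrinking the block sizes and num_stages is the usual way to cut the
    # shared-memory requirement of a candidate configuration.
    candidate_configs = [
        triton.Config({"BLOCK_M": 64, "BLOCK_N": 128}, num_stages=4, num_warps=8),
        triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_stages=2, num_warps=4),
    ]

Pruning or skipping such configurations up front keeps the autotuning sweep running instead of aborting on oversized candidates; the surviving configs are the only ones worth timing.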
[… ≈250 further "-[triton-dejavu] First execution including JIT compilation took …s." timing lines (mostly 0.16–1.10 s each) elided …]
-[triton-dejavu] First execution including JIT compilation took 0.5187058448791504s. -[triton-dejavu] First execution including JIT compilation took 0.345888614654541s. -[triton-dejavu] First execution including JIT compilation took 0.2869832515716553s. -[triton-dejavu] First execution including JIT compilation took 0.4653136730194092s. -[triton-dejavu] First execution including JIT compilation took 0.318464994430542s. -[triton-dejavu] First execution including JIT compilation took 0.2446439266204834s. -[triton-dejavu] First execution including JIT compilation took 0.48433685302734375s. -[triton-dejavu] First execution including JIT compilation took 0.33881640434265137s. -[triton-dejavu] First execution including JIT compilation took 0.2716357707977295s. -[triton-dejavu] First execution including JIT compilation took 0.5825600624084473s. -[triton-dejavu] First execution including JIT compilation took 0.3634026050567627s. -[triton-dejavu] First execution including JIT compilation took 0.2927565574645996s. -[triton-dejavu] First execution including JIT compilation took 0.6513199806213379s. -[triton-dejavu] First execution including JIT compilation took 0.39306163787841797s. -[triton-dejavu] First execution including JIT compilation took 0.3288865089416504s. -[triton-dejavu] First execution including JIT compilation took 0.6803631782531738s. -[triton-dejavu] First execution including JIT compilation took 0.4358654022216797s. -[triton-dejavu] First execution including JIT compilation took 0.3263130187988281s. -[triton-dejavu] First execution including JIT compilation took 0.7428200244903564s. -[triton-dejavu] First execution including JIT compilation took 0.4704313278198242s. -[triton-dejavu] First execution including JIT compilation took 0.3472471237182617s. -[triton-dejavu] First execution including JIT compilation took 0.8439326286315918s. -[triton-dejavu] First execution including JIT compilation took 0.5137937068939209s. -[triton-dejavu] First execution including JIT compilation took 0.37453126907348633s. -[triton-dejavu] First execution including JIT compilation took 0.8335433006286621s. -[triton-dejavu] First execution including JIT compilation took 0.49039268493652344s. -[triton-dejavu] First execution including JIT compilation took 0.33686327934265137s. -[triton-dejavu] First execution including JIT compilation took 0.8961453437805176s. -[triton-dejavu] First execution including JIT compilation took 0.4983179569244385s. -[triton-dejavu] First execution including JIT compilation took 0.35771870613098145s. -[triton-dejavu] First execution including JIT compilation took 1.4264824390411377s. -[triton-dejavu] First execution including JIT compilation took 0.590933084487915s. -[triton-dejavu] First execution including JIT compilation took 0.37283968925476074s. -[triton-dejavu] First execution including JIT compilation took 1.513688564300537s. -[triton-dejavu] First execution including JIT compilation took 0.6336290836334229s. -[triton-dejavu] First execution including JIT compilation took 0.3925764560699463s. -[triton-dejavu] First execution including JIT compilation took 1.6222319602966309s. -[triton-dejavu] First execution including JIT compilation took 0.6844735145568848s. -[triton-dejavu] First execution including JIT compilation took 0.4293532371520996s. -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.27669787406921387s. -[triton-dejavu] First execution including JIT compilation took 0.2200946807861328s. -[triton-dejavu] First execution including JIT compilation took 0.20387721061706543s. -[triton-dejavu] First execution including JIT compilation took 0.28987956047058105s. -[triton-dejavu] First execution including JIT compilation took 0.2356255054473877s. -[triton-dejavu] First execution including JIT compilation took 0.27132534980773926s. 
-[triton-dejavu] First execution including JIT compilation took 0.32961010932922363s. -[triton-dejavu] First execution including JIT compilation took 0.30097293853759766s. -[triton-dejavu] First execution including JIT compilation took 0.19495487213134766s. -[triton-dejavu] First execution including JIT compilation took 0.3256947994232178s. -[triton-dejavu] First execution including JIT compilation took 0.2637317180633545s. -[triton-dejavu] First execution including JIT compilation took 0.20687651634216309s. -[triton-dejavu] First execution including JIT compilation took 0.31104588508605957s. -[triton-dejavu] First execution including JIT compilation took 0.23851871490478516s. -[triton-dejavu] First execution including JIT compilation took 0.21181392669677734s. -[triton-dejavu] First execution including JIT compilation took 0.31918883323669434s. -[triton-dejavu] First execution including JIT compilation took 0.26523566246032715s. -[triton-dejavu] First execution including JIT compilation took 0.24065852165222168s. -[triton-dejavu] First execution including JIT compilation took 0.4031362533569336s. -[triton-dejavu] First execution including JIT compilation took 0.32692384719848633s. -[triton-dejavu] First execution including JIT compilation took 0.30884742736816406s. -[triton-dejavu] First execution including JIT compilation took 0.36347198486328125s. -[triton-dejavu] First execution including JIT compilation took 0.275341272354126s. -[triton-dejavu] First execution including JIT compilation took 0.22562313079833984s. -[triton-dejavu] First execution including JIT compilation took 0.3837006092071533s. -[triton-dejavu] First execution including JIT compilation took 0.28661417961120605s. -[triton-dejavu] First execution including JIT compilation took 0.2673346996307373s. -[triton-dejavu] First execution including JIT compilation took 0.42246246337890625s. -[triton-dejavu] First execution including JIT compilation took 0.3161001205444336s. -[triton-dejavu] First execution including JIT compilation took 0.25901246070861816s. -[triton-dejavu] First execution including JIT compilation took 0.4973328113555908s. -[triton-dejavu] First execution including JIT compilation took 0.33356618881225586s. -[triton-dejavu] First execution including JIT compilation took 0.27872180938720703s. -[triton-dejavu] First execution including JIT compilation took 0.46326756477355957s. -[triton-dejavu] First execution including JIT compilation took 0.35817837715148926s. -[triton-dejavu] First execution including JIT compilation took 0.2817208766937256s. -[triton-dejavu] First execution including JIT compilation took 0.49773097038269043s. -[triton-dejavu] First execution including JIT compilation took 0.3602900505065918s. -[triton-dejavu] First execution including JIT compilation took 0.3025212287902832s. -[triton-dejavu] First execution including JIT compilation took 0.5235435962677002s. -[triton-dejavu] First execution including JIT compilation took 0.3942751884460449s. -[triton-dejavu] First execution including JIT compilation took 0.3084683418273926s. -[triton-dejavu] First execution including JIT compilation took 0.4975898265838623s. -[triton-dejavu] First execution including JIT compilation took 0.3797109127044678s. -[triton-dejavu] First execution including JIT compilation took 0.30298733711242676s. -[triton-dejavu] First execution including JIT compilation took 0.5086245536804199s. -[triton-dejavu] First execution including JIT compilation took 0.3442721366882324s. 
-[triton-dejavu] First execution including JIT compilation took 0.2747983932495117s. -[triton-dejavu] First execution including JIT compilation took 0.5613915920257568s. -[triton-dejavu] First execution including JIT compilation took 0.44350624084472656s. -[triton-dejavu] First execution including JIT compilation took 0.28230857849121094s. -[triton-dejavu] First execution including JIT compilation took 0.6058351993560791s. -[triton-dejavu] First execution including JIT compilation took 0.38971829414367676s. -[triton-dejavu] First execution including JIT compilation took 0.3060598373413086s. -[triton-dejavu] First execution including JIT compilation took 0.6243553161621094s. -[triton-dejavu] First execution including JIT compilation took 0.4053328037261963s. -[triton-dejavu] First execution including JIT compilation took 0.3107168674468994s. -[triton-dejavu] First execution including JIT compilation took 0.6879723072052002s. -[triton-dejavu] First execution including JIT compilation took 0.4307887554168701s. -[triton-dejavu] First execution including JIT compilation took 0.3229548931121826s. -[triton-dejavu] First execution including JIT compilation took 0.7918972969055176s. -[triton-dejavu] First execution including JIT compilation took 0.48616766929626465s. -[triton-dejavu] First execution including JIT compilation took 0.36690473556518555s. -[triton-dejavu] First execution including JIT compilation took 0.8196022510528564s. -[triton-dejavu] First execution including JIT compilation took 0.4654698371887207s. -[triton-dejavu] First execution including JIT compilation took 0.3202695846557617s. -[triton-dejavu] First execution including JIT compilation took 0.7949032783508301s. -[triton-dejavu] First execution including JIT compilation took 0.4816138744354248s. -[triton-dejavu] First execution including JIT compilation took 0.3490898609161377s. -[triton-dejavu] First execution including JIT compilation took 1.313990831375122s. -[triton-dejavu] First execution including JIT compilation took 0.6957395076751709s. -[triton-dejavu] First execution including JIT compilation took 0.36670374870300293s. -[triton-dejavu] First execution including JIT compilation took 1.390000581741333s. -[triton-dejavu] First execution including JIT compilation took 0.5718057155609131s. -[triton-dejavu] First execution including JIT compilation took 0.4198739528656006s. -[triton-dejavu] First execution including JIT compilation took 1.4662723541259766s. -[triton-dejavu] First execution including JIT compilation took 0.6175658702850342s. -[triton-dejavu] First execution including JIT compilation took 0.398082971572876s. -[triton-dejavu] First execution including JIT compilation took 1.5167500972747803s. -[triton-dejavu] First execution including JIT compilation took 0.6479494571685791s. -[triton-dejavu] First execution including JIT compilation took 0.41569066047668457s. -bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.4222400188446045s. -[triton-dejavu] First execution including JIT compilation took 0.5973012447357178s. -[triton-dejavu] First execution including JIT compilation took 0.3583037853240967s. -[triton-dejavu] First execution including JIT compilation took 1.356217861175537s. -[triton-dejavu] First execution including JIT compilation took 0.6360659599304199s. -[triton-dejavu] First execution including JIT compilation took 0.4930713176727295s. -[triton-dejavu] First execution including JIT compilation took 5.760070323944092s. -[triton-dejavu] First execution including JIT compilation took 1.333890438079834s. -[triton-dejavu] First execution including JIT compilation took 0.5843362808227539s. -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.4048187732696533s. -[triton-dejavu] First execution including JIT compilation took 0.2757749557495117s. -[triton-dejavu] First execution including JIT compilation took 0.22593021392822266s. -[triton-dejavu] First execution including JIT compilation took 0.3719668388366699s. -[triton-dejavu] First execution including JIT compilation took 0.2907881736755371s. -[triton-dejavu] First execution including JIT compilation took 0.22260475158691406s. 
-[triton-dejavu] First execution including JIT compilation took 0.3940160274505615s. -[triton-dejavu] First execution including JIT compilation took 0.29627037048339844s. -[triton-dejavu] First execution including JIT compilation took 0.23922204971313477s. -[triton-dejavu] First execution including JIT compilation took 0.4309115409851074s. -[triton-dejavu] First execution including JIT compilation took 0.3773322105407715s. -[triton-dejavu] First execution including JIT compilation took 0.31682252883911133s. -[triton-dejavu] First execution including JIT compilation took 0.4817657470703125s. -[triton-dejavu] First execution including JIT compilation took 0.30999183654785156s. -[triton-dejavu] First execution including JIT compilation took 0.2649409770965576s. -[triton-dejavu] First execution including JIT compilation took 0.4654359817504883s. -[triton-dejavu] First execution including JIT compilation took 0.3404858112335205s. -[triton-dejavu] First execution including JIT compilation took 0.2549777030944824s. -[triton-dejavu] First execution including JIT compilation took 0.5529801845550537s. -[triton-dejavu] First execution including JIT compilation took 0.39357733726501465s. -[triton-dejavu] First execution including JIT compilation took 0.2779700756072998s. -[triton-dejavu] First execution including JIT compilation took 0.4679603576660156s. -[triton-dejavu] First execution including JIT compilation took 0.3258848190307617s. -[triton-dejavu] First execution including JIT compilation took 0.22054314613342285s. -[triton-dejavu] First execution including JIT compilation took 0.5082552433013916s. -[triton-dejavu] First execution including JIT compilation took 0.33693814277648926s. -[triton-dejavu] First execution including JIT compilation took 0.2745835781097412s. -[triton-dejavu] First execution including JIT compilation took 0.5847163200378418s. -[triton-dejavu] First execution including JIT compilation took 0.33575940132141113s. -[triton-dejavu] First execution including JIT compilation took 0.3060939311981201s. -[triton-dejavu] First execution including JIT compilation took 0.5239126682281494s. -[triton-dejavu] First execution including JIT compilation took 0.35081052780151367s. -[triton-dejavu] First execution including JIT compilation took 0.2809262275695801s. -[triton-dejavu] First execution including JIT compilation took 0.5589377880096436s. -[triton-dejavu] First execution including JIT compilation took 0.36190342903137207s. -[triton-dejavu] First execution including JIT compilation took 0.27885007858276367s. -[triton-dejavu] First execution including JIT compilation took 0.6101348400115967s. -[triton-dejavu] First execution including JIT compilation took 0.4172549247741699s. -[triton-dejavu] First execution including JIT compilation took 0.3286736011505127s. -[triton-dejavu] First execution including JIT compilation took 0.6457531452178955s. -[triton-dejavu] First execution including JIT compilation took 0.39678049087524414s. -[triton-dejavu] First execution including JIT compilation took 0.30982041358947754s. -[triton-dejavu] First execution including JIT compilation took 0.6744742393493652s. -[triton-dejavu] First execution including JIT compilation took 0.42897796630859375s. -[triton-dejavu] First execution including JIT compilation took 0.26523256301879883s. -[triton-dejavu] First execution including JIT compilation took 0.7656145095825195s. -[triton-dejavu] First execution including JIT compilation took 0.40720272064208984s. 
-[triton-dejavu] First execution including JIT compilation took 0.28449296951293945s. -[triton-dejavu] First execution including JIT compilation took 1.1293773651123047s. -[triton-dejavu] First execution including JIT compilation took 0.5252115726470947s. -[triton-dejavu] First execution including JIT compilation took 0.3610687255859375s. -[triton-dejavu] First execution including JIT compilation took 1.4119246006011963s. -[triton-dejavu] First execution including JIT compilation took 0.6041393280029297s. -[triton-dejavu] First execution including JIT compilation took 0.3884885311126709s. -[triton-dejavu] First execution including JIT compilation took 1.526637315750122s. -[triton-dejavu] First execution including JIT compilation took 0.6266424655914307s. -[triton-dejavu] First execution including JIT compilation took 0.40192389488220215s. -[triton-dejavu] First execution including JIT compilation took 1.530601978302002s. -[triton-dejavu] First execution including JIT compilation took 0.6515090465545654s. -[triton-dejavu] First execution including JIT compilation took 0.42690610885620117s. -bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-
-bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 1.7079706192016602s.
-[triton-dejavu] First execution including JIT compilation took 0.7440791130065918s.
-[triton-dejavu] First execution including JIT compilation took 0.4444162845611572s.
-[triton-dejavu] First execution including JIT compilation took 1.7328886985778809s.
-[triton-dejavu] First execution including JIT compilation took 0.7971758842468262s.
-[triton-dejavu] First execution including JIT compilation took 0.47760677337646484s.
-[triton-dejavu] First execution including JIT compilation took 5.828885316848755s.
-[triton-dejavu] First execution including JIT compilation took 1.288949966430664s.
-[triton-dejavu] First execution including JIT compilation took 0.5151238441467285s.
-[triton-dejavu] First execution including JIT compilation took 5.60798192024231s.
-[triton-dejavu] First execution including JIT compilation took 1.3259506225585938s.
-[triton-dejavu] First execution including JIT compilation took 0.5839335918426514s.
-bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 3.490701913833618s.
-[triton-dejavu] First execution including JIT compilation took 1.3819916248321533s.
-[triton-dejavu] First execution including JIT compilation took 0.681626558303833s.
-[triton-dejavu] First execution including JIT compilation took 4.5987389087677s.
-[triton-dejavu] First execution including JIT compilation took 1.3767080307006836s.
-[triton-dejavu] First execution including JIT compilation took 0.6134452819824219s.
-bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 737280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 737280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 737280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 1032192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 1032192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 1032192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1032192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.20271754264831543s. -[triton-dejavu] First execution including JIT compilation took 0.17133116722106934s. -[triton-dejavu] First execution including JIT compilation took 0.1536731719970703s. -[triton-dejavu] First execution including JIT compilation took 0.19096851348876953s. -[triton-dejavu] First execution including JIT compilation took 0.17846441268920898s. -[triton-dejavu] First execution including JIT compilation took 0.1891782283782959s. -[triton-dejavu] First execution including JIT compilation took 0.23286938667297363s. -[triton-dejavu] First execution including JIT compilation took 0.21250367164611816s. -[triton-dejavu] First execution including JIT compilation took 0.2115018367767334s. -[triton-dejavu] First execution including JIT compilation took 0.25536513328552246s. -[triton-dejavu] First execution including JIT compilation took 0.26403188705444336s. -[triton-dejavu] First execution including JIT compilation took 0.20061635971069336s. -[triton-dejavu] First execution including JIT compilation took 0.27070093154907227s. -[triton-dejavu] First execution including JIT compilation took 0.2410728931427002s. -[triton-dejavu] First execution including JIT compilation took 0.2050936222076416s. -[triton-dejavu] First execution including JIT compilation took 0.28530001640319824s. -[triton-dejavu] First execution including JIT compilation took 0.24301719665527344s. -[triton-dejavu] First execution including JIT compilation took 0.2149193286895752s. -[triton-dejavu] First execution including JIT compilation took 0.29409146308898926s. -[triton-dejavu] First execution including JIT compilation took 0.2827568054199219s. -[triton-dejavu] First execution including JIT compilation took 0.2236497402191162s. -[triton-dejavu] First execution including JIT compilation took 0.2511780261993408s. -[triton-dejavu] First execution including JIT compilation took 0.21310901641845703s. -[triton-dejavu] First execution including JIT compilation took 0.19583463668823242s. 
-[triton-dejavu] First execution including JIT compilation took 0.270064115524292s. -[triton-dejavu] First execution including JIT compilation took 0.2317502498626709s. -[triton-dejavu] First execution including JIT compilation took 0.2144770622253418s. -[triton-dejavu] First execution including JIT compilation took 0.28452420234680176s. -[triton-dejavu] First execution including JIT compilation took 0.24449563026428223s. -[triton-dejavu] First execution including JIT compilation took 0.2711045742034912s. -[triton-dejavu] First execution including JIT compilation took 0.30714941024780273s. -[triton-dejavu] First execution including JIT compilation took 0.2577242851257324s. -[triton-dejavu] First execution including JIT compilation took 0.23275232315063477s. -[triton-dejavu] First execution including JIT compilation took 0.32830142974853516s. -[triton-dejavu] First execution including JIT compilation took 0.25277233123779297s. -[triton-dejavu] First execution including JIT compilation took 0.23861432075500488s. -[triton-dejavu] First execution including JIT compilation took 0.31818604469299316s. -[triton-dejavu] First execution including JIT compilation took 0.26758432388305664s. -[triton-dejavu] First execution including JIT compilation took 0.2486262321472168s. -[triton-dejavu] First execution including JIT compilation took 0.3456125259399414s. -[triton-dejavu] First execution including JIT compilation took 0.33374500274658203s. -[triton-dejavu] First execution including JIT compilation took 0.2485215663909912s. -[triton-dejavu] First execution including JIT compilation took 0.30871033668518066s. -[triton-dejavu] First execution including JIT compilation took 0.22252321243286133s. -[triton-dejavu] First execution including JIT compilation took 0.20645499229431152s. -[triton-dejavu] First execution including JIT compilation took 0.3251798152923584s. -[triton-dejavu] First execution including JIT compilation took 0.2487037181854248s. -[triton-dejavu] First execution including JIT compilation took 0.22485733032226562s. -[triton-dejavu] First execution including JIT compilation took 0.33643627166748047s. -[triton-dejavu] First execution including JIT compilation took 0.2661266326904297s. -[triton-dejavu] First execution including JIT compilation took 0.2295246124267578s. -[triton-dejavu] First execution including JIT compilation took 0.38455843925476074s. -[triton-dejavu] First execution including JIT compilation took 0.2738194465637207s. -[triton-dejavu] First execution including JIT compilation took 0.24585938453674316s. -[triton-dejavu] First execution including JIT compilation took 0.4033064842224121s. -[triton-dejavu] First execution including JIT compilation took 0.2825932502746582s. -[triton-dejavu] First execution including JIT compilation took 0.2537994384765625s. -[triton-dejavu] First execution including JIT compilation took 0.4199497699737549s. -[triton-dejavu] First execution including JIT compilation took 0.2951374053955078s. -[triton-dejavu] First execution including JIT compilation took 0.2603449821472168s. -[triton-dejavu] First execution including JIT compilation took 0.4944791793823242s. -[triton-dejavu] First execution including JIT compilation took 0.3230445384979248s. -[triton-dejavu] First execution including JIT compilation took 0.29880690574645996s. -[triton-dejavu] First execution including JIT compilation took 0.4109377861022949s. -[triton-dejavu] First execution including JIT compilation took 0.27936363220214844s. 
-[triton-dejavu] First execution including JIT compilation took 0.23092174530029297s. -[triton-dejavu] First execution including JIT compilation took 0.428159236907959s. -[triton-dejavu] First execution including JIT compilation took 0.2879374027252197s. -[triton-dejavu] First execution including JIT compilation took 0.2565889358520508s. -[triton-dejavu] First execution including JIT compilation took 0.5160079002380371s. -[triton-dejavu] First execution including JIT compilation took 0.31639909744262695s. -[triton-dejavu] First execution including JIT compilation took 0.2591373920440674s. -[triton-dejavu] First execution including JIT compilation took 0.5452303886413574s. -[triton-dejavu] First execution including JIT compilation took 0.3242976665496826s. -[triton-dejavu] First execution including JIT compilation took 0.2623326778411865s. -[triton-dejavu] First execution including JIT compilation took 0.5922431945800781s. -[triton-dejavu] First execution including JIT compilation took 0.34310412406921387s. -[triton-dejavu] First execution including JIT compilation took 0.27410078048706055s. -[triton-dejavu] First execution including JIT compilation took 0.6470179557800293s. -[triton-dejavu] First execution including JIT compilation took 0.3510866165161133s. -[triton-dejavu] First execution including JIT compilation took 0.2860851287841797s. -[triton-dejavu] First execution including JIT compilation took 0.7252414226531982s. -[triton-dejavu] First execution including JIT compilation took 0.4050569534301758s. -[triton-dejavu] First execution including JIT compilation took 0.30899977684020996s. -[triton-dejavu] First execution including JIT compilation took 0.6849832534790039s. -[triton-dejavu] First execution including JIT compilation took 0.4114108085632324s. -[triton-dejavu] First execution including JIT compilation took 0.26646900177001953s. -[triton-dejavu] First execution including JIT compilation took 0.731346845626831s. -[triton-dejavu] First execution including JIT compilation took 0.37401604652404785s. -[triton-dejavu] First execution including JIT compilation took 0.2981090545654297s. -[triton-dejavu] First execution including JIT compilation took 1.255406141281128s. -[triton-dejavu] First execution including JIT compilation took 0.4646761417388916s. -[triton-dejavu] First execution including JIT compilation took 0.31566929817199707s. -[triton-dejavu] First execution including JIT compilation took 1.3867464065551758s. -[triton-dejavu] First execution including JIT compilation took 0.4915659427642822s. -[triton-dejavu] First execution including JIT compilation took 0.336292028427124s. -[triton-dejavu] First execution including JIT compilation took 1.4543449878692627s. -[triton-dejavu] First execution including JIT compilation took 0.5164680480957031s. -[triton-dejavu] First execution including JIT compilation took 0.35480237007141113s. -[triton-dejavu] First execution including JIT compilation took 1.5047898292541504s. -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-
-bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.2522404193878174s. -[triton-dejavu] First execution including JIT compilation took 0.20596766471862793s. -[triton-dejavu] First execution including JIT compilation took 0.19162297248840332s. -[triton-dejavu] First execution including JIT compilation took 0.26123833656311035s. -[triton-dejavu] First execution including JIT compilation took 0.22704195976257324s. -[triton-dejavu] First execution including JIT compilation took 0.2119007110595703s. -[triton-dejavu] First execution including JIT compilation took 0.29358959197998047s. -[triton-dejavu] First execution including JIT compilation took 0.23833417892456055s. -[triton-dejavu] First execution including JIT compilation took 0.2248821258544922s. -[triton-dejavu] First execution including JIT compilation took 0.29287123680114746s. -[triton-dejavu] First execution including JIT compilation took 0.25087571144104004s. -[triton-dejavu] First execution including JIT compilation took 0.23653793334960938s. -[triton-dejavu] First execution including JIT compilation took 0.3085510730743408s. -[triton-dejavu] First execution including JIT compilation took 0.22148561477661133s. -[triton-dejavu] First execution including JIT compilation took 0.20212173461914062s. -[triton-dejavu] First execution including JIT compilation took 0.26024508476257324s. -[triton-dejavu] First execution including JIT compilation took 0.20862603187561035s. -[triton-dejavu] First execution including JIT compilation took 0.2089087963104248s. -[triton-dejavu] First execution including JIT compilation took 0.28104591369628906s. -[triton-dejavu] First execution including JIT compilation took 0.2973470687866211s. -[triton-dejavu] First execution including JIT compilation took 0.20228052139282227s. -[triton-dejavu] First execution including JIT compilation took 0.3051731586456299s. -[triton-dejavu] First execution including JIT compilation took 0.21213650703430176s. -[triton-dejavu] First execution including JIT compilation took 0.18593811988830566s. 
-[triton-dejavu] First execution including JIT compilation took 0.25663161277770996s. -[triton-dejavu] First execution including JIT compilation took 0.2507617473602295s. -[triton-dejavu] First execution including JIT compilation took 0.2245655059814453s. -[triton-dejavu] First execution including JIT compilation took 0.3247964382171631s. -[triton-dejavu] First execution including JIT compilation took 0.25675010681152344s. -[triton-dejavu] First execution including JIT compilation took 0.2267463207244873s. -[triton-dejavu] First execution including JIT compilation took 0.3354456424713135s. -[triton-dejavu] First execution including JIT compilation took 0.25986337661743164s. -[triton-dejavu] First execution including JIT compilation took 0.23006272315979004s. -[triton-dejavu] First execution including JIT compilation took 0.002689838409423828s. -[triton-dejavu] First execution including JIT compilation took 0.2550840377807617s. -[triton-dejavu] First execution including JIT compilation took 0.24457740783691406s. -[triton-dejavu] First execution including JIT compilation took 0.34949541091918945s. -[triton-dejavu] First execution including JIT compilation took 0.2756638526916504s. -[triton-dejavu] First execution including JIT compilation took 0.2527437210083008s. -[triton-dejavu] First execution including JIT compilation took 0.3787045478820801s. -[triton-dejavu] First execution including JIT compilation took 0.2873713970184326s. -[triton-dejavu] First execution including JIT compilation took 0.2592127323150635s. -[triton-dejavu] First execution including JIT compilation took 0.33814334869384766s. -[triton-dejavu] First execution including JIT compilation took 0.2517428398132324s. -[triton-dejavu] First execution including JIT compilation took 0.21767520904541016s. -[triton-dejavu] First execution including JIT compilation took 0.36879444122314453s. -[triton-dejavu] First execution including JIT compilation took 0.2698078155517578s. -[triton-dejavu] First execution including JIT compilation took 0.2365264892578125s. -[triton-dejavu] First execution including JIT compilation took 0.47873687744140625s. -[triton-dejavu] First execution including JIT compilation took 0.2871267795562744s. -[triton-dejavu] First execution including JIT compilation took 0.24500083923339844s. -[triton-dejavu] First execution including JIT compilation took 0.4963796138763428s. -[triton-dejavu] First execution including JIT compilation took 0.30948710441589355s. -[triton-dejavu] First execution including JIT compilation took 0.25137853622436523s. -[triton-dejavu] First execution including JIT compilation took 0.4584636688232422s. -[triton-dejavu] First execution including JIT compilation took 0.3162257671356201s. -[triton-dejavu] First execution including JIT compilation took 0.2994105815887451s. -[triton-dejavu] First execution including JIT compilation took 0.4786410331726074s. -[triton-dejavu] First execution including JIT compilation took 0.3190131187438965s. -[triton-dejavu] First execution including JIT compilation took 0.2704010009765625s. -[triton-dejavu] First execution including JIT compilation took 0.5149548053741455s. -[triton-dejavu] First execution including JIT compilation took 0.3378560543060303s. -[triton-dejavu] First execution including JIT compilation took 0.28589439392089844s. -[triton-dejavu] First execution including JIT compilation took 0.47048139572143555s. -[triton-dejavu] First execution including JIT compilation took 0.28485631942749023s. 
-[triton-dejavu] First execution including JIT compilation took 0.23804211616516113s. -[triton-dejavu] First execution including JIT compilation took 0.4914519786834717s. -[triton-dejavu] First execution including JIT compilation took 0.30657291412353516s. -[triton-dejavu] First execution including JIT compilation took 0.2527627944946289s. -[triton-dejavu] First execution including JIT compilation took 0.7375938892364502s. -[triton-dejavu] First execution including JIT compilation took 0.33797788619995117s. -[triton-dejavu] First execution including JIT compilation took 0.27010035514831543s. -[triton-dejavu] First execution including JIT compilation took 0.6535592079162598s. -[triton-dejavu] First execution including JIT compilation took 0.35745692253112793s. -[triton-dejavu] First execution including JIT compilation took 0.2837181091308594s. -[triton-dejavu] First execution including JIT compilation took 0.6975975036621094s. -[triton-dejavu] First execution including JIT compilation took 0.4016244411468506s. -[triton-dejavu] First execution including JIT compilation took 0.30004215240478516s. -[triton-dejavu] First execution including JIT compilation took 0.7542321681976318s. -[triton-dejavu] First execution including JIT compilation took 0.4112386703491211s. -[triton-dejavu] First execution including JIT compilation took 0.3136770725250244s. -[triton-dejavu] First execution including JIT compilation took 0.854001522064209s. -[triton-dejavu] First execution including JIT compilation took 0.5444228649139404s. -[triton-dejavu] First execution including JIT compilation took 0.34048891067504883s. -[triton-dejavu] First execution including JIT compilation took 0.8623373508453369s. -[triton-dejavu] First execution including JIT compilation took 0.4289731979370117s. -[triton-dejavu] First execution including JIT compilation took 0.2985663414001465s. -[triton-dejavu] First execution including JIT compilation took 0.8375389575958252s. -[triton-dejavu] First execution including JIT compilation took 0.4361135959625244s. -[triton-dejavu] First execution including JIT compilation took 0.3154265880584717s. -[triton-dejavu] First execution including JIT compilation took 1.547863483428955s. -[triton-dejavu] First execution including JIT compilation took 0.511976957321167s. -[triton-dejavu] First execution including JIT compilation took 0.33377766609191895s. -[triton-dejavu] First execution including JIT compilation took 1.6245229244232178s. -[triton-dejavu] First execution including JIT compilation took 0.5486083030700684s. -[triton-dejavu] First execution including JIT compilation took 0.35009217262268066s. -[triton-dejavu] First execution including JIT compilation took 1.730604887008667s. -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.31005024909973145s. -[triton-dejavu] First execution including JIT compilation took 0.2568016052246094s. -[triton-dejavu] First execution including JIT compilation took 0.21983957290649414s. -[triton-dejavu] First execution including JIT compilation took 0.3342258930206299s. -[triton-dejavu] First execution including JIT compilation took 0.2613508701324463s. -[triton-dejavu] First execution including JIT compilation took 0.23277544975280762s. 
-[triton-dejavu] First execution including JIT compilation took 0.34593868255615234s. -[triton-dejavu] First execution including JIT compilation took 0.27214527130126953s. -[triton-dejavu] First execution including JIT compilation took 0.24357295036315918s. -[triton-dejavu] First execution including JIT compilation took 0.3831825256347656s. -[triton-dejavu] First execution including JIT compilation took 0.2801399230957031s. -[triton-dejavu] First execution including JIT compilation took 0.28713178634643555s. -[triton-dejavu] First execution including JIT compilation took 0.3746922016143799s. -[triton-dejavu] First execution including JIT compilation took 0.29146361351013184s. -[triton-dejavu] First execution including JIT compilation took 0.25294995307922363s. -[triton-dejavu] First execution including JIT compilation took 0.3896350860595703s. -[triton-dejavu] First execution including JIT compilation took 0.3028104305267334s. -[triton-dejavu] First execution including JIT compilation took 0.2598695755004883s. -[triton-dejavu] First execution including JIT compilation took 0.4107673168182373s. -[triton-dejavu] First execution including JIT compilation took 0.3029160499572754s. -[triton-dejavu] First execution including JIT compilation took 0.27234864234924316s. -[triton-dejavu] First execution including JIT compilation took 0.3524813652038574s. -[triton-dejavu] First execution including JIT compilation took 0.2637143135070801s. -[triton-dejavu] First execution including JIT compilation took 0.21795105934143066s. -[triton-dejavu] First execution including JIT compilation took 0.36962461471557617s. -[triton-dejavu] First execution including JIT compilation took 0.2753579616546631s. -[triton-dejavu] First execution including JIT compilation took 0.24502253532409668s. -[triton-dejavu] First execution including JIT compilation took 0.38353514671325684s. -[triton-dejavu] First execution including JIT compilation took 0.25853633880615234s. -[triton-dejavu] First execution including JIT compilation took 0.23975038528442383s. -[triton-dejavu] First execution including JIT compilation took 0.0030221939086914062s. -[triton-dejavu] First execution including JIT compilation took 0.29683613777160645s. -[triton-dejavu] First execution including JIT compilation took 0.2580904960632324s. -[triton-dejavu] First execution including JIT compilation took 0.4290771484375s. -[triton-dejavu] First execution including JIT compilation took 0.3167991638183594s. -[triton-dejavu] First execution including JIT compilation took 0.2567250728607178s. -[triton-dejavu] First execution including JIT compilation took 0.44550418853759766s. -[triton-dejavu] First execution including JIT compilation took 0.3198390007019043s. -[triton-dejavu] First execution including JIT compilation took 0.268108606338501s. -[triton-dejavu] First execution including JIT compilation took 0.4916553497314453s. -[triton-dejavu] First execution including JIT compilation took 0.3439137935638428s. -[triton-dejavu] First execution including JIT compilation took 0.27727365493774414s. -[triton-dejavu] First execution including JIT compilation took 0.460857629776001s. -[triton-dejavu] First execution including JIT compilation took 0.30243563652038574s. -[triton-dejavu] First execution including JIT compilation took 0.24333858489990234s. -[triton-dejavu] First execution including JIT compilation took 0.46892428398132324s. -[triton-dejavu] First execution including JIT compilation took 0.3167304992675781s. 
-[triton-dejavu] First execution including JIT compilation took 0.2599649429321289s. -[triton-dejavu] First execution including JIT compilation took 0.5126926898956299s. -[triton-dejavu] First execution including JIT compilation took 0.32805609703063965s. -[triton-dejavu] First execution including JIT compilation took 0.26161670684814453s. -[triton-dejavu] First execution including JIT compilation took 0.5467493534088135s. -[triton-dejavu] First execution including JIT compilation took 0.3979170322418213s. -[triton-dejavu] First execution including JIT compilation took 0.27261829376220703s. -[triton-dejavu] First execution including JIT compilation took 0.56540846824646s. -[triton-dejavu] First execution including JIT compilation took 0.35355091094970703s. -[triton-dejavu] First execution including JIT compilation took 0.276700496673584s. -[triton-dejavu] First execution including JIT compilation took 0.5869178771972656s. -[triton-dejavu] First execution including JIT compilation took 0.3624422550201416s. -[triton-dejavu] First execution including JIT compilation took 0.35153841972351074s. -[triton-dejavu] First execution including JIT compilation took 0.6571488380432129s. -[triton-dejavu] First execution including JIT compilation took 0.3958284854888916s. -[triton-dejavu] First execution including JIT compilation took 0.30527758598327637s. -[triton-dejavu] First execution including JIT compilation took 0.6626615524291992s. -[triton-dejavu] First execution including JIT compilation took 0.3544487953186035s. -[triton-dejavu] First execution including JIT compilation took 0.2698044776916504s. -[triton-dejavu] First execution including JIT compilation took 0.6961638927459717s. -[triton-dejavu] First execution including JIT compilation took 0.38259434700012207s. -[triton-dejavu] First execution including JIT compilation took 0.283905029296875s. -[triton-dejavu] First execution including JIT compilation took 0.845867395401001s. -[triton-dejavu] First execution including JIT compilation took 0.4127688407897949s. -[triton-dejavu] First execution including JIT compilation took 0.3159315586090088s. -[triton-dejavu] First execution including JIT compilation took 0.9087560176849365s. -[triton-dejavu] First execution including JIT compilation took 0.4513425827026367s. -[triton-dejavu] First execution including JIT compilation took 0.3294107913970947s. -[triton-dejavu] First execution including JIT compilation took 0.9571695327758789s. -[triton-dejavu] First execution including JIT compilation took 0.4684031009674072s. -[triton-dejavu] First execution including JIT compilation took 0.32914233207702637s. -[triton-dejavu] First execution including JIT compilation took 1.0359725952148438s. -[triton-dejavu] First execution including JIT compilation took 0.48003387451171875s. -[triton-dejavu] First execution including JIT compilation took 0.34207773208618164s. -[triton-dejavu] First execution including JIT compilation took 1.1305363178253174s. -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.2873709201812744s. -[triton-dejavu] First execution including JIT compilation took 0.534212589263916s. -[triton-dejavu] First execution including JIT compilation took 0.34093737602233887s. -[triton-dejavu] First execution including JIT compilation took 1.2213225364685059s. -[triton-dejavu] First execution including JIT compilation took 0.5500822067260742s. -[triton-dejavu] First execution including JIT compilation took 0.3482015132904053s. 
-[triton-dejavu] First execution including JIT compilation took 2.321138620376587s. -[triton-dejavu] First execution including JIT compilation took 0.5398764610290527s. -[triton-dejavu] First execution including JIT compilation took 0.3589463233947754s. -[triton-dejavu] First execution including JIT compilation took 2.2305822372436523s. -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 458752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 458752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.4210233688354492s. -[triton-dejavu] First execution including JIT compilation took 0.2671318054199219s. -[triton-dejavu] First execution including JIT compilation took 0.20480823516845703s. -[triton-dejavu] First execution including JIT compilation took 0.36168575286865234s. -[triton-dejavu] First execution including JIT compilation took 0.2831258773803711s. -[triton-dejavu] First execution including JIT compilation took 0.22981572151184082s. -[triton-dejavu] First execution including JIT compilation took 0.3903160095214844s. -[triton-dejavu] First execution including JIT compilation took 0.2804446220397949s. -[triton-dejavu] First execution including JIT compilation took 0.22222447395324707s. -[triton-dejavu] First execution including JIT compilation took 0.3762855529785156s. -[triton-dejavu] First execution including JIT compilation took 0.2824244499206543s. -[triton-dejavu] First execution including JIT compilation took 0.22802186012268066s. -[triton-dejavu] First execution including JIT compilation took 0.4042317867279053s. -[triton-dejavu] First execution including JIT compilation took 0.2850303649902344s. -[triton-dejavu] First execution including JIT compilation took 0.22367358207702637s. -[triton-dejavu] First execution including JIT compilation took 0.45253777503967285s. -[triton-dejavu] First execution including JIT compilation took 0.3078906536102295s. -[triton-dejavu] First execution including JIT compilation took 0.23833608627319336s. -[triton-dejavu] First execution including JIT compilation took 0.47820162773132324s. -[triton-dejavu] First execution including JIT compilation took 0.332599401473999s. -[triton-dejavu] First execution including JIT compilation took 0.256058931350708s. -[triton-dejavu] First execution including JIT compilation took 0.4336233139038086s. -[triton-dejavu] First execution including JIT compilation took 0.2906990051269531s. -[triton-dejavu] First execution including JIT compilation took 0.22593021392822266s. -[triton-dejavu] First execution including JIT compilation took 0.43659496307373047s. 
-[triton-dejavu] First execution including JIT compilation took 0.295365571975708s.
[... many further "[triton-dejavu] First execution including JIT compilation took ...s." log lines elided here; the recorded timings range from roughly 0.003 s to 7.0 s ...]
-bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-               ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
[... the same "bench_cudagraph failed" message and traceback repeat many more times in the removed log, interleaved with further JIT-compilation timing lines; only the required shared memory changes (between 245760 and 786432 bytes) against the same 232448-byte hardware limit ...]
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 5.166867017745972s. -[triton-dejavu] First execution including JIT compilation took 3.8762850761413574s. -[triton-dejavu] First execution including JIT compilation took 0.7830004692077637s. -[triton-dejavu] First execution including JIT compilation took 6.120173931121826s. -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1146880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1146880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.33310794830322266s. -[triton-dejavu] First execution including JIT compilation took 0.23391294479370117s. -[triton-dejavu] First execution including JIT compilation took 0.2084214687347412s. -[triton-dejavu] First execution including JIT compilation took 0.3005564212799072s. -[triton-dejavu] First execution including JIT compilation took 0.2554941177368164s. -[triton-dejavu] First execution including JIT compilation took 0.21907782554626465s. 
-[triton-dejavu] First execution including JIT compilation took 0.31569480895996094s. -[triton-dejavu] First execution including JIT compilation took 0.2694690227508545s. -[triton-dejavu] First execution including JIT compilation took 0.22438645362854004s. -[triton-dejavu] First execution including JIT compilation took 0.3229238986968994s. -[triton-dejavu] First execution including JIT compilation took 0.2797393798828125s. -[triton-dejavu] First execution including JIT compilation took 0.23162508010864258s. -[triton-dejavu] First execution including JIT compilation took 0.3365445137023926s. -[triton-dejavu] First execution including JIT compilation took 0.2754044532775879s. -[triton-dejavu] First execution including JIT compilation took 0.22548437118530273s. -[triton-dejavu] First execution including JIT compilation took 0.3373396396636963s. -[triton-dejavu] First execution including JIT compilation took 0.2857697010040283s. -[triton-dejavu] First execution including JIT compilation took 0.2294597625732422s. -[triton-dejavu] First execution including JIT compilation took 0.3634176254272461s. -[triton-dejavu] First execution including JIT compilation took 0.294708251953125s. -[triton-dejavu] First execution including JIT compilation took 0.24236321449279785s. -[triton-dejavu] First execution including JIT compilation took 0.3028702735900879s. -[triton-dejavu] First execution including JIT compilation took 0.2470991611480713s. -[triton-dejavu] First execution including JIT compilation took 0.21360516548156738s. -[triton-dejavu] First execution including JIT compilation took 0.3189256191253662s. -[triton-dejavu] First execution including JIT compilation took 0.25740885734558105s. -[triton-dejavu] First execution including JIT compilation took 0.23542547225952148s. -[triton-dejavu] First execution including JIT compilation took 0.34380078315734863s. -[triton-dejavu] First execution including JIT compilation took 0.2774670124053955s. -[triton-dejavu] First execution including JIT compilation took 0.24950265884399414s. -[triton-dejavu] First execution including JIT compilation took 0.4161198139190674s. -[triton-dejavu] First execution including JIT compilation took 0.28986072540283203s. -[triton-dejavu] First execution including JIT compilation took 0.2589759826660156s. -[triton-dejavu] First execution including JIT compilation took 0.41210365295410156s. -[triton-dejavu] First execution including JIT compilation took 0.32729268074035645s. -[triton-dejavu] First execution including JIT compilation took 0.25850629806518555s. -[triton-dejavu] First execution including JIT compilation took 0.4299044609069824s. -[triton-dejavu] First execution including JIT compilation took 0.3116121292114258s. -[triton-dejavu] First execution including JIT compilation took 0.27123379707336426s. -[triton-dejavu] First execution including JIT compilation took 0.45281362533569336s. -[triton-dejavu] First execution including JIT compilation took 0.3351759910583496s. -[triton-dejavu] First execution including JIT compilation took 0.2787160873413086s. -[triton-dejavu] First execution including JIT compilation took 0.41561436653137207s. -[triton-dejavu] First execution including JIT compilation took 0.27190589904785156s. -[triton-dejavu] First execution including JIT compilation took 0.2324838638305664s. -[triton-dejavu] First execution including JIT compilation took 0.4087650775909424s. -[triton-dejavu] First execution including JIT compilation took 0.28690099716186523s. 
-[triton-dejavu] First execution including JIT compilation took 0.24116730690002441s. -[triton-dejavu] First execution including JIT compilation took 0.5066123008728027s. -[triton-dejavu] First execution including JIT compilation took 0.3034372329711914s. -[triton-dejavu] First execution including JIT compilation took 0.25580596923828125s. -[triton-dejavu] First execution including JIT compilation took 0.525223970413208s. -[triton-dejavu] First execution including JIT compilation took 0.33296680450439453s. -[triton-dejavu] First execution including JIT compilation took 0.27128124237060547s. -[triton-dejavu] First execution including JIT compilation took 0.5657172203063965s. -[triton-dejavu] First execution including JIT compilation took 0.3399391174316406s. -[triton-dejavu] First execution including JIT compilation took 0.28380680084228516s. -[triton-dejavu] First execution including JIT compilation took 0.6111602783203125s. -[triton-dejavu] First execution including JIT compilation took 0.36371636390686035s. -[triton-dejavu] First execution including JIT compilation took 0.3011593818664551s. -[triton-dejavu] First execution including JIT compilation took 0.7230055332183838s. -[triton-dejavu] First execution including JIT compilation took 0.4232914447784424s. -[triton-dejavu] First execution including JIT compilation took 0.31528306007385254s. -[triton-dejavu] First execution including JIT compilation took 0.6461219787597656s. -[triton-dejavu] First execution including JIT compilation took 0.36070823669433594s. -[triton-dejavu] First execution including JIT compilation took 0.2686340808868408s. -[triton-dejavu] First execution including JIT compilation took 0.6663899421691895s. -[triton-dejavu] First execution including JIT compilation took 0.3726685047149658s. -[triton-dejavu] First execution including JIT compilation took 0.2806117534637451s. -[triton-dejavu] First execution including JIT compilation took 1.2110939025878906s. -[triton-dejavu] First execution including JIT compilation took 0.43669724464416504s. -[triton-dejavu] First execution including JIT compilation took 0.29979729652404785s. -[triton-dejavu] First execution including JIT compilation took 1.2734310626983643s. -[triton-dejavu] First execution including JIT compilation took 0.4524815082550049s. -[triton-dejavu] First execution including JIT compilation took 0.30893588066101074s. -[triton-dejavu] First execution including JIT compilation took 1.3412039279937744s. -[triton-dejavu] First execution including JIT compilation took 0.4808011054992676s. -[triton-dejavu] First execution including JIT compilation took 0.32793712615966797s. -[triton-dejavu] First execution including JIT compilation took 1.3745002746582031s. -[triton-dejavu] First execution including JIT compilation took 0.4310414791107178s. -[triton-dejavu] First execution including JIT compilation took 0.2714354991912842s. -bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.2250773906707764s. -[triton-dejavu] First execution including JIT compilation took 0.4646158218383789s. -[triton-dejavu] First execution including JIT compilation took 0.2895984649658203s. -[triton-dejavu] First execution including JIT compilation took 1.310636043548584s. -[triton-dejavu] First execution including JIT compilation took 0.5870482921600342s. -[triton-dejavu] First execution including JIT compilation took 0.36336755752563477s. -[triton-dejavu] First execution including JIT compilation took 5.4522223472595215s. -[triton-dejavu] First execution including JIT compilation took 0.9788007736206055s. -[triton-dejavu] First execution including JIT compilation took 0.3662402629852295s. -[triton-dejavu] First execution including JIT compilation took 5.491236209869385s. -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-
-[... the identical OutOfResources traceback and "bench_cudagraph failed with out of resource: shared memory" message repeat for every oversized autotuner config, with Required values between 245760 and 655360 bytes against the 232448-byte hardware limit ...]
-
-[... interleaved "[triton-dejavu] First execution including JIT compilation took ...s." log lines for the remaining configs, with times between roughly 0.2 s and 5.8 s ...]
-[triton-dejavu] First execution including JIT compilation took 7.324063301086426s. -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 688128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 688128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.7107067108154297s. -[triton-dejavu] First execution including JIT compilation took 0.40702342987060547s. -[triton-dejavu] First execution including JIT compilation took 0.271730899810791s. -[triton-dejavu] First execution including JIT compilation took 0.7312145233154297s. -[triton-dejavu] First execution including JIT compilation took 0.4074242115020752s. -[triton-dejavu] First execution including JIT compilation took 0.2868027687072754s. 
-[triton-dejavu] First execution including JIT compilation took 0.7007474899291992s. -[triton-dejavu] First execution including JIT compilation took 0.42080259323120117s. -[triton-dejavu] First execution including JIT compilation took 0.27320408821105957s. -[triton-dejavu] First execution including JIT compilation took 0.722841739654541s. -[triton-dejavu] First execution including JIT compilation took 0.5550060272216797s. -[triton-dejavu] First execution including JIT compilation took 0.32157206535339355s. -[triton-dejavu] First execution including JIT compilation took 0.8072361946105957s. -[triton-dejavu] First execution including JIT compilation took 0.43352651596069336s. -[triton-dejavu] First execution including JIT compilation took 0.2982165813446045s. -[triton-dejavu] First execution including JIT compilation took 0.7527244091033936s. -[triton-dejavu] First execution including JIT compilation took 0.4649670124053955s. -[triton-dejavu] First execution including JIT compilation took 0.3391098976135254s. -[triton-dejavu] First execution including JIT compilation took 0.936931848526001s. -[triton-dejavu] First execution including JIT compilation took 0.46184659004211426s. -[triton-dejavu] First execution including JIT compilation took 0.2983987331390381s. -[triton-dejavu] First execution including JIT compilation took 0.7631199359893799s. -[triton-dejavu] First execution including JIT compilation took 0.39908528327941895s. -[triton-dejavu] First execution including JIT compilation took 0.32989048957824707s. -[triton-dejavu] First execution including JIT compilation took 0.7596316337585449s. -[triton-dejavu] First execution including JIT compilation took 0.43782997131347656s. -[triton-dejavu] First execution including JIT compilation took 0.3047447204589844s. -[triton-dejavu] First execution including JIT compilation took 0.8982362747192383s. -[triton-dejavu] First execution including JIT compilation took 0.4925217628479004s. -[triton-dejavu] First execution including JIT compilation took 0.3316771984100342s. -[triton-dejavu] First execution including JIT compilation took 0.864621639251709s. -[triton-dejavu] First execution including JIT compilation took 0.016417741775512695s. -[triton-dejavu] First execution including JIT compilation took 0.3927609920501709s. -[triton-dejavu] First execution including JIT compilation took 0.8940439224243164s. -[triton-dejavu] First execution including JIT compilation took 0.4808948040008545s. -[triton-dejavu] First execution including JIT compilation took 0.3320889472961426s. -[triton-dejavu] First execution including JIT compilation took 0.9511239528656006s. -[triton-dejavu] First execution including JIT compilation took 0.510263204574585s. -[triton-dejavu] First execution including JIT compilation took 0.3106980323791504s. -[triton-dejavu] First execution including JIT compilation took 0.9828391075134277s. -[triton-dejavu] First execution including JIT compilation took 0.6096630096435547s. -[triton-dejavu] First execution including JIT compilation took 0.34572386741638184s. -[triton-dejavu] First execution including JIT compilation took 1.0500340461730957s. -[triton-dejavu] First execution including JIT compilation took 0.4872100353240967s. -[triton-dejavu] First execution including JIT compilation took 0.3133056163787842s. -[triton-dejavu] First execution including JIT compilation took 1.0500223636627197s. -[triton-dejavu] First execution including JIT compilation took 0.5330610275268555s. 
-[triton-dejavu] First execution including JIT compilation took 0.3301053047180176s. -[triton-dejavu] First execution including JIT compilation took 1.1683359146118164s. -[triton-dejavu] First execution including JIT compilation took 0.5536503791809082s. -[triton-dejavu] First execution including JIT compilation took 0.34630656242370605s. -[triton-dejavu] First execution including JIT compilation took 1.6548552513122559s. -[triton-dejavu] First execution including JIT compilation took 0.7423355579376221s. -[triton-dejavu] First execution including JIT compilation took 0.4386255741119385s. -[triton-dejavu] First execution including JIT compilation took 1.7830908298492432s. -[triton-dejavu] First execution including JIT compilation took 0.7749922275543213s. -[triton-dejavu] First execution including JIT compilation took 0.4658083915710449s. -[triton-dejavu] First execution including JIT compilation took 1.905794382095337s. -[triton-dejavu] First execution including JIT compilation took 0.816021203994751s. -[triton-dejavu] First execution including JIT compilation took 0.4723987579345703s. -[triton-dejavu] First execution including JIT compilation took 1.869170904159546s. -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.789454460144043s. -[triton-dejavu] First execution including JIT compilation took 0.6909325122833252s. -[triton-dejavu] First execution including JIT compilation took 0.4053471088409424s. -[triton-dejavu] First execution including JIT compilation took 1.7492396831512451s. -[triton-dejavu] First execution including JIT compilation took 0.7165470123291016s. -[triton-dejavu] First execution including JIT compilation took 0.4185338020324707s. -[triton-dejavu] First execution including JIT compilation took 2.8342366218566895s. -[triton-dejavu] First execution including JIT compilation took 0.8270382881164551s. -[triton-dejavu] First execution including JIT compilation took 0.4604911804199219s. -[triton-dejavu] First execution including JIT compilation took 2.927734851837158s. -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 458752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 458752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 4.0586724281311035s. -[triton-dejavu] First execution including JIT compilation took 1.5080604553222656s. -[triton-dejavu] First execution including JIT compilation took 0.6953163146972656s. -[triton-dejavu] First execution including JIT compilation took 4.226686000823975s. -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 917504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 917504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.3607323169708252s. -[triton-dejavu] First execution including JIT compilation took 0.7990188598632812s. -[triton-dejavu] First execution including JIT compilation took 0.39726877212524414s. -[triton-dejavu] First execution including JIT compilation took 1.393247127532959s. -[triton-dejavu] First execution including JIT compilation took 0.9832372665405273s. -[triton-dejavu] First execution including JIT compilation took 0.4763679504394531s. 
-[triton-dejavu] First execution including JIT compilation took 1.456979513168335s. -[triton-dejavu] First execution including JIT compilation took 1.0040147304534912s. -[triton-dejavu] First execution including JIT compilation took 0.4499683380126953s. -[triton-dejavu] First execution including JIT compilation took 1.467405080795288s. -[triton-dejavu] First execution including JIT compilation took 1.0723049640655518s. -[triton-dejavu] First execution including JIT compilation took 0.49906277656555176s. -[triton-dejavu] First execution including JIT compilation took 1.524533987045288s. -[triton-dejavu] First execution including JIT compilation took 1.4248688220977783s. -[triton-dejavu] First execution including JIT compilation took 0.6042609214782715s. -[triton-dejavu] First execution including JIT compilation took 1.7416322231292725s. -[triton-dejavu] First execution including JIT compilation took 1.0214593410491943s. -[triton-dejavu] First execution including JIT compilation took 0.45897865295410156s. -[triton-dejavu] First execution including JIT compilation took 1.5276007652282715s. -[triton-dejavu] First execution including JIT compilation took 1.0185387134552002s. -[triton-dejavu] First execution including JIT compilation took 0.5293161869049072s. -[triton-dejavu] First execution including JIT compilation took 1.8517167568206787s. -[triton-dejavu] First execution including JIT compilation took 0.9630119800567627s. -[triton-dejavu] First execution including JIT compilation took 0.43575310707092285s. -[triton-dejavu] First execution including JIT compilation took 1.9177396297454834s. -[triton-dejavu] First execution including JIT compilation took 1.569082498550415s. -[triton-dejavu] First execution including JIT compilation took 0.622168779373169s. -[triton-dejavu] First execution including JIT compilation took 2.339301347732544s. -[triton-dejavu] First execution including JIT compilation took 1.5994513034820557s. -[triton-dejavu] First execution including JIT compilation took 0.6422829627990723s. -[triton-dejavu] First execution including JIT compilation took 2.1358773708343506s. -[triton-dejavu] First execution including JIT compilation took 1.1553890705108643s. -[triton-dejavu] First execution including JIT compilation took 0.5729074478149414s. -[triton-dejavu] First execution including JIT compilation took 1.8737192153930664s. -[triton-dejavu] First execution including JIT compilation took 1.6270005702972412s. -[triton-dejavu] First execution including JIT compilation took 0.5927095413208008s. -[triton-dejavu] First execution including JIT compilation took 1.9137556552886963s. -[triton-dejavu] First execution including JIT compilation took 1.6627833843231201s. -[triton-dejavu] First execution including JIT compilation took 0.6282734870910645s. -[triton-dejavu] First execution including JIT compilation took 2.6357598304748535s. -[triton-dejavu] First execution including JIT compilation took 1.3591229915618896s. -[triton-dejavu] First execution including JIT compilation took 0.6953067779541016s. -[triton-dejavu] First execution including JIT compilation took 2.43611741065979s. -[triton-dejavu] First execution including JIT compilation took 1.2323598861694336s. -[triton-dejavu] First execution including JIT compilation took 0.6111257076263428s. -[triton-dejavu] First execution including JIT compilation took 2.841799259185791s. -[triton-dejavu] First execution including JIT compilation took 1.360656976699829s. 
-[triton-dejavu] First execution including JIT compilation took 0.8137938976287842s. -[triton-dejavu] First execution including JIT compilation took 3.458110809326172s. -[triton-dejavu] First execution including JIT compilation took 1.5271718502044678s. -[triton-dejavu] First execution including JIT compilation took 0.0032939910888671875s. -[triton-dejavu] First execution including JIT compilation took 2.9182276725769043s. -[triton-dejavu] First execution including JIT compilation took 1.539180040359497s. -[triton-dejavu] First execution including JIT compilation took 0.6763615608215332s. -[triton-dejavu] First execution including JIT compilation took 3.089775562286377s. -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 4.155709505081177s. -[triton-dejavu] First execution including JIT compilation took 2.4426767826080322s. -[triton-dejavu] First execution including JIT compilation took 1.0379819869995117s. -[triton-dejavu] First execution including JIT compilation took 4.222529649734497s. -[triton-dejavu] First execution including JIT compilation took 2.4925472736358643s. -[triton-dejavu] First execution including JIT compilation took 1.073103666305542s. -[triton-dejavu] First execution including JIT compilation took 8.762295961380005s. -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. 
Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 688128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-RuntimeError: Triton Error [CUDA]: out of memory
-[triton-dejavu] First execution including JIT compilation took 7.50976037979126s.
-[triton-dejavu] First execution including JIT compilation took 3.327193260192871s.
-bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1179648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1376256, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1572864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-[triton-dejavu] First execution including JIT compilation took 0.3581666946411133s.
-[triton-dejavu] First execution including JIT compilation took 0.2740662097930908s.
-[triton-dejavu] First execution including JIT compilation took 0.23471784591674805s.
-[triton-dejavu] First execution including JIT compilation took 0.3980753421783447s.
-[triton-dejavu] First execution including JIT compilation took 0.28090405464172363s.
-[triton-dejavu] First execution including JIT compilation took 0.21105456352233887s.
-bench_cudagraph failed with out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 487424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 3.93192195892334s. -[triton-dejavu] First execution including JIT compilation took 1.0750982761383057s. -[triton-dejavu] First execution including JIT compilation took 0.5941033363342285s. -[triton-dejavu] First execution including JIT compilation took 4.812488079071045s. -bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 696320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 696320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 696320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 696320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 696320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 696320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 835584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 835584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 835584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 835584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 974848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 974848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1114112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1114112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1114112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1114112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.5393879413604736s. -[triton-dejavu] First execution including JIT compilation took 0.36963605880737305s. -[triton-dejavu] First execution including JIT compilation took 0.3970627784729004s. -[triton-dejavu] First execution including JIT compilation took 0.4841430187225342s. -[triton-dejavu] First execution including JIT compilation took 0.3199918270111084s. -[triton-dejavu] First execution including JIT compilation took 0.2800755500793457s. 
-[triton-dejavu] First execution including JIT compilation took 0.5644237995147705s. -[triton-dejavu] First execution including JIT compilation took 0.3869204521179199s. -[triton-dejavu] First execution including JIT compilation took 0.4037020206451416s. -[triton-dejavu] First execution including JIT compilation took 0.5500500202178955s. -[triton-dejavu] First execution including JIT compilation took 0.3945121765136719s. -[triton-dejavu] First execution including JIT compilation took 0.3146946430206299s. -[triton-dejavu] First execution including JIT compilation took 0.5734715461730957s. -[triton-dejavu] First execution including JIT compilation took 0.5372509956359863s. -[triton-dejavu] First execution including JIT compilation took 0.3640165328979492s. -[triton-dejavu] First execution including JIT compilation took 0.6109771728515625s. -[triton-dejavu] First execution including JIT compilation took 0.4634361267089844s. -[triton-dejavu] First execution including JIT compilation took 0.4206717014312744s. -[triton-dejavu] First execution including JIT compilation took 1.0486819744110107s. -[triton-dejavu] First execution including JIT compilation took 0.44484424591064453s. -[triton-dejavu] First execution including JIT compilation took 0.3491060733795166s. -[triton-dejavu] First execution including JIT compilation took 0.7697179317474365s. -[triton-dejavu] First execution including JIT compilation took 0.3961319923400879s. -[triton-dejavu] First execution including JIT compilation took 0.3008708953857422s. -[triton-dejavu] First execution including JIT compilation took 0.6616361141204834s. -[triton-dejavu] First execution including JIT compilation took 0.45753026008605957s. -[triton-dejavu] First execution including JIT compilation took 0.3097813129425049s. -[triton-dejavu] First execution including JIT compilation took 0.7761518955230713s. -[triton-dejavu] First execution including JIT compilation took 0.5004098415374756s. -[triton-dejavu] First execution including JIT compilation took 0.3134744167327881s. -[triton-dejavu] First execution including JIT compilation took 0.7714171409606934s. -[triton-dejavu] First execution including JIT compilation took 0.7993361949920654s. -[triton-dejavu] First execution including JIT compilation took 0.34277820587158203s. -[triton-dejavu] First execution including JIT compilation took 0.808971643447876s. -[triton-dejavu] First execution including JIT compilation took 0.4371776580810547s. -[triton-dejavu] First execution including JIT compilation took 0.31221866607666016s. -[triton-dejavu] First execution including JIT compilation took 0.6809587478637695s. -[triton-dejavu] First execution including JIT compilation took 0.40524864196777344s. -[triton-dejavu] First execution including JIT compilation took 0.49398159980773926s. -[triton-dejavu] First execution including JIT compilation took 0.7367451190948486s. -[triton-dejavu] First execution including JIT compilation took 0.7439749240875244s. -[triton-dejavu] First execution including JIT compilation took 0.3696317672729492s. -[triton-dejavu] First execution including JIT compilation took 1.1181640625s. -[triton-dejavu] First execution including JIT compilation took 0.4313173294067383s. -[triton-dejavu] First execution including JIT compilation took 0.297299861907959s. -[triton-dejavu] First execution including JIT compilation took 0.8869140148162842s. -[triton-dejavu] First execution including JIT compilation took 0.48682713508605957s. 
-[triton-dejavu] First execution including JIT compilation took 0.3501567840576172s. -[triton-dejavu] First execution including JIT compilation took 1.4581646919250488s. -[triton-dejavu] First execution including JIT compilation took 0.5649135112762451s. -[triton-dejavu] First execution including JIT compilation took 0.3721659183502197s. -[triton-dejavu] First execution including JIT compilation took 1.5119690895080566s. -[triton-dejavu] First execution including JIT compilation took 0.5899574756622314s. -[triton-dejavu] First execution including JIT compilation took 0.3819904327392578s. -[triton-dejavu] First execution including JIT compilation took 1.6209561824798584s. -[triton-dejavu] First execution including JIT compilation took 0.6263985633850098s. -[triton-dejavu] First execution including JIT compilation took 0.38887882232666016s. -[triton-dejavu] First execution including JIT compilation took 1.7282218933105469s. -[triton-dejavu] First execution including JIT compilation took 0.6377005577087402s. -[triton-dejavu] First execution including JIT compilation took 0.4078361988067627s. -bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.6659209728240967s. -[triton-dejavu] First execution including JIT compilation took 0.9628505706787109s. -[triton-dejavu] First execution including JIT compilation took 0.4381530284881592s. -[triton-dejavu] First execution including JIT compilation took 1.6766464710235596s. -[triton-dejavu] First execution including JIT compilation took 0.7337453365325928s. -[triton-dejavu] First execution including JIT compilation took 0.673093318939209s. 
-[triton-dejavu] First execution including JIT compilation took 7.029362678527832s. -[triton-dejavu] First execution including JIT compilation took 1.219388484954834s. -[triton-dejavu] First execution including JIT compilation took 0.8028266429901123s. -[triton-dejavu] First execution including JIT compilation took 6.798900127410889s. -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
[... the same OutOfResources traceback repeats in the removed log for the remaining tuning configurations, with Required values between 245760 and 1179648 bytes against the same 232448-byte hardware limit, interleaved with "[triton-dejavu] First execution including JIT compilation took ...s." timing lines ...]
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 6.010982990264893s. -[triton-dejavu] First execution including JIT compilation took 1.7582054138183594s. -[triton-dejavu] First execution including JIT compilation took 1.0528242588043213s. -[triton-dejavu] First execution including JIT compilation took 6.84581995010376s. -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1146880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1146880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.7142207622528076s. -[triton-dejavu] First execution including JIT compilation took 0.8741211891174316s. -[triton-dejavu] First execution including JIT compilation took 0.44619178771972656s. -[triton-dejavu] First execution including JIT compilation took 1.8718609809875488s. -[triton-dejavu] First execution including JIT compilation took 0.9042544364929199s. -[triton-dejavu] First execution including JIT compilation took 0.4581465721130371s. -[triton-dejavu] First execution including JIT compilation took 2.1042685508728027s. -[triton-dejavu] First execution including JIT compilation took 0.908367395401001s. -[triton-dejavu] First execution including JIT compilation took 0.48277711868286133s. -[triton-dejavu] First execution including JIT compilation took 1.7529594898223877s. -[triton-dejavu] First execution including JIT compilation took 0.9210634231567383s. -[triton-dejavu] First execution including JIT compilation took 0.5785129070281982s. -[triton-dejavu] First execution including JIT compilation took 1.9719526767730713s. -[triton-dejavu] First execution including JIT compilation took 0.926983118057251s. -[triton-dejavu] First execution including JIT compilation took 0.47329115867614746s. -[triton-dejavu] First execution including JIT compilation took 1.8675498962402344s. -[triton-dejavu] First execution including JIT compilation took 0.8849301338195801s. -[triton-dejavu] First execution including JIT compilation took 0.4898045063018799s. -[triton-dejavu] First execution including JIT compilation took 1.819542407989502s. -[triton-dejavu] First execution including JIT compilation took 0.981731653213501s. -[triton-dejavu] First execution including JIT compilation took 0.5096790790557861s. -[triton-dejavu] First execution including JIT compilation took 2.11425518989563s. -[triton-dejavu] First execution including JIT compilation took 0.837721586227417s. -[triton-dejavu] First execution including JIT compilation took 0.4882984161376953s. -[triton-dejavu] First execution including JIT compilation took 2.053067922592163s. 
-[triton-dejavu] First execution including JIT compilation took 0.897794246673584s. -[triton-dejavu] First execution including JIT compilation took 0.4767446517944336s. -[triton-dejavu] First execution including JIT compilation took 2.07883358001709s. -[triton-dejavu] First execution including JIT compilation took 1.0238347053527832s. -[triton-dejavu] First execution including JIT compilation took 0.6266560554504395s. -[triton-dejavu] First execution including JIT compilation took 2.814924478530884s. -[triton-dejavu] First execution including JIT compilation took 1.255967378616333s. -[triton-dejavu] First execution including JIT compilation took 0.680903434753418s. -[triton-dejavu] First execution including JIT compilation took 2.395393133163452s. -[triton-dejavu] First execution including JIT compilation took 1.0010457038879395s. -[triton-dejavu] First execution including JIT compilation took 0.6347818374633789s. -[triton-dejavu] First execution including JIT compilation took 2.7960519790649414s. -[triton-dejavu] First execution including JIT compilation took 1.0326149463653564s. -[triton-dejavu] First execution including JIT compilation took 0.5450241565704346s. -[triton-dejavu] First execution including JIT compilation took 2.445779800415039s. -[triton-dejavu] First execution including JIT compilation took 1.0319764614105225s. -[triton-dejavu] First execution including JIT compilation took 0.6632704734802246s. -[triton-dejavu] First execution including JIT compilation took 2.80086088180542s. -[triton-dejavu] First execution including JIT compilation took 1.1742348670959473s. -[triton-dejavu] First execution including JIT compilation took 0.5098991394042969s. -[triton-dejavu] First execution including JIT compilation took 2.790087938308716s. -[triton-dejavu] First execution including JIT compilation took 1.1971583366394043s. -[triton-dejavu] First execution including JIT compilation took 0.5753312110900879s. -[triton-dejavu] First execution including JIT compilation took 3.8199825286865234s. -[triton-dejavu] First execution including JIT compilation took 1.5596168041229248s. -[triton-dejavu] First execution including JIT compilation took 0.7234528064727783s. -[triton-dejavu] First execution including JIT compilation took 3.8001348972320557s. -[triton-dejavu] First execution including JIT compilation took 1.3300747871398926s. -[triton-dejavu] First execution including JIT compilation took 0.8064060211181641s. -[triton-dejavu] First execution including JIT compilation took 3.833221673965454s. -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 4.838615655899048s. -[triton-dejavu] First execution including JIT compilation took 1.5911104679107666s. -[triton-dejavu] First execution including JIT compilation took 0.7249307632446289s. -[triton-dejavu] First execution including JIT compilation took 5.080144166946411s. -[triton-dejavu] First execution including JIT compilation took 1.7896246910095215s. -[triton-dejavu] First execution including JIT compilation took 0.7319927215576172s. 
-[triton-dejavu] First execution including JIT compilation took 10.777840614318848s.
-bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 688128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__
-    self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
-RuntimeError: Triton Error [CUDA]: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 5.033360242843628s.
-[triton-dejavu] First execution including JIT compilation took 1.410045862197876s.
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1179648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1179648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1376256, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1572864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1572864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-[triton-dejavu] First execution including JIT compilation took 4.680861473083496s.
-[triton-dejavu] First execution including JIT compilation took 2.364461898803711s.
-[triton-dejavu] First execution including JIT compilation took 0.8534502983093262s.
-[triton-dejavu] First execution including JIT compilation took 4.708017349243164s.
-[triton-dejavu] First execution including JIT compilation took 2.841503858566284s.
-[triton-dejavu] First execution including JIT compilation took 1.0484719276428223s.
-[triton-dejavu] First execution including JIT compilation took 4.7807886600494385s.
-[triton-dejavu] First execution including JIT compilation took 2.8980062007904053s.
-[triton-dejavu] First execution including JIT compilation took 1.0707988739013672s.
-[triton-dejavu] First execution including JIT compilation took 4.607600212097168s.
-[triton-dejavu] First execution including JIT compilation took 2.8636832237243652s.
-[triton-dejavu] First execution including JIT compilation took 1.1431879997253418s.
-[triton-dejavu] First execution including JIT compilation took 4.923970699310303s.
-[triton-dejavu] First execution including JIT compilation took 2.79614520072937s.
-[triton-dejavu] First execution including JIT compilation took 1.0749492645263672s.
-[triton-dejavu] First execution including JIT compilation took 4.696893692016602s.
-[triton-dejavu] First execution including JIT compilation took 2.8622703552246094s.
-[triton-dejavu] First execution including JIT compilation took 1.0982391834259033s.
-[triton-dejavu] First execution including JIT compilation took 4.7404444217681885s.
-[triton-dejavu] First execution including JIT compilation took 2.878173828125s.
-[triton-dejavu] First execution including JIT compilation took 1.1065995693206787s.
-[triton-dejavu] First execution including JIT compilation took 4.991016626358032s.
-[triton-dejavu] First execution including JIT compilation took 2.5021591186523438s.
-[triton-dejavu] First execution including JIT compilation took 0.9695248603820801s.
-[triton-dejavu] First execution including JIT compilation took 5.3018670082092285s.
-[triton-dejavu] First execution including JIT compilation took 3.273489236831665s.
-[triton-dejavu] First execution including JIT compilation took 1.181260108947754s.
-[triton-dejavu] First execution including JIT compilation took 5.431257247924805s.
-[triton-dejavu] First execution including JIT compilation took 3.352473497390747s.
-[triton-dejavu] First execution including JIT compilation took 1.186856985092163s.
-[triton-dejavu] First execution including JIT compilation took 5.393920183181763s.
-[triton-dejavu] First execution including JIT compilation took 3.40191650390625s.
-[triton-dejavu] First execution including JIT compilation took 1.1941492557525635s.
-[triton-dejavu] First execution including JIT compilation took 5.543420314788818s.
-[triton-dejavu] First execution including JIT compilation took 3.3016717433929443s.
-[triton-dejavu] First execution including JIT compilation took 1.2081632614135742s.
-[triton-dejavu] First execution including JIT compilation took 5.640880107879639s.
-[triton-dejavu] First execution including JIT compilation took 3.5443694591522217s.
-[triton-dejavu] First execution including JIT compilation took 1.3958439826965332s.
-[triton-dejavu] First execution including JIT compilation took 5.6015305519104s.
-bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-[triton-dejavu] First execution including JIT compilation took 6.935962200164795s.
-[triton-dejavu] First execution including JIT compilation took 3.3080406188964844s.
-[triton-dejavu] First execution including JIT compilation took 1.2709336280822754s.
-[triton-dejavu] First execution including JIT compilation took 7.072402715682983s.
-[triton-dejavu] First execution including JIT compilation took 3.7861485481262207s.
-[triton-dejavu] First execution including JIT compilation took 1.4361011981964111s.
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-[triton-dejavu] First execution including JIT compilation took 5.464360475540161s.
-[triton-dejavu] First execution including JIT compilation took 1.6160335540771484s.
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 458752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 458752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ - self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) -RuntimeError: Triton Error [CUDA]: out of memory - -[triton-dejavu] First execution including JIT compilation took 5.580164670944214s. -[triton-dejavu] First execution including JIT compilation took 2.2763874530792236s. -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ - self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 917504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 917504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
[... the same "bench_cudagraph failed with out of resource: shared memory" report and traceback repeat for the larger candidate configurations; only the required shared memory grows (786432, 1048576, 1310720, 1572864, 1835008, 2097152 bytes) against the same 232448-byte hardware limit ...]
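The OutOfResources failures above are the expected cost of an exhaustive sweep: the autotuner benchmarks every candidate configuration, and the larger tile sizes and stage counts simply cannot fit in the ~232 KB of shared memory this GPU offers per SM. A minimal sketch of filtering such candidates before they are benchmarked follows; it is not code from this patch, the footprint formula is only a rough upper bound for a tiled matmul-style kernel, and the helper name and 2-byte element size (bf16) are assumptions.

# Hedged sketch, not part of this patch: drop autotune candidates that cannot fit
# into the 232448-byte shared-memory limit reported in the log before benchmarking.
SHARED_MEM_LIMIT = 232448  # bytes; hardware limit reported above

def fits_in_shared_memory(block_m, block_n, block_k, num_stages, elem_bytes=2):
    # Rough upper bound for a tiled matmul-style kernel: one A tile and one B tile
    # are kept resident per pipeline stage (elem_bytes=2 assumes bf16 inputs).
    tile_bytes = (block_m * block_k + block_k * block_n) * elem_bytes
    return tile_bytes * num_stages <= SHARED_MEM_LIMIT

# A large candidate is rejected, while a small one (close to the config the
# autotuner eventually selected) passes:
print(fits_in_shared_memory(128, 256, 64, num_stages=8))  # False
print(fits_in_shared_memory(16, 32, 64, num_stages=4))    # True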
-
-[triton-dejavu] added BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 32, BLOCK_SIZE_K: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None for _bmm_chunk_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default and key ('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')
-[2025-07-23 17:21:31] Triton autotuning for function _bmm_chunk_fwd_kernel finished after 10756.57s; best config selected: BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 32, BLOCK_SIZE_K: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None with benchmark time 0.002230335958302021; evaluated 2625 configurations;
-[triton-dejavu] ('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16') not in cache, starting to tune...
-[triton-dejavu] [2025-07-23 17:21:31] Started benchmarking of 2625 configurations... (use_bo: False, run: 0)
[... a long run of "[triton-dejavu] First execution including JIT compilation took ..." lines omitted; the reported times range from roughly 0.25 s to 1.15 s ...]
[... further "[triton-dejavu] First execution including JIT compilation took ..." timings (roughly 0.38 s to 1.18 s) omitted ...]
-bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
[... the matching OutOfResources traceback repeats for the next few configurations; it is identical to the one shown earlier except for the 251904-byte requirement ...]
-[triton-dejavu] First execution including JIT compilation took 0.4952049255371094s.
-bench_cudagraph failed with CUDA error: out of memory
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1
-Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
-
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph
-    with torch.cuda.graph(g):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__
-    self.cuda_graph.capture_end()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end
-    super().capture_end()
-RuntimeError: CUDA error: out of memory
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1
-Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
[... the same "bench_cudagraph failed with CUDA error: out of memory" report and traceback repeat for most of the remaining configurations, interleaved with "[triton-dejavu] First execution including JIT compilation took ..." timings of roughly 0.3 s to 0.9 s ...]
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5226426124572754s. -[triton-dejavu] First execution including JIT compilation took 0.44080543518066406s. -[triton-dejavu] First execution including JIT compilation took 0.7858779430389404s. -[triton-dejavu] First execution including JIT compilation took 0.5671470165252686s. -[triton-dejavu] First execution including JIT compilation took 0.45592260360717773s. -[triton-dejavu] First execution including JIT compilation took 0.8100578784942627s. -[triton-dejavu] First execution including JIT compilation took 0.6213173866271973s. -[triton-dejavu] First execution including JIT compilation took 0.47237181663513184s. -[triton-dejavu] First execution including JIT compilation took 0.7891368865966797s. -[triton-dejavu] First execution including JIT compilation took 0.6662912368774414s. -[triton-dejavu] First execution including JIT compilation took 0.4879744052886963s. -[triton-dejavu] First execution including JIT compilation took 0.731757640838623s. -[triton-dejavu] First execution including JIT compilation took 0.4918680191040039s. -[triton-dejavu] First execution including JIT compilation took 0.37989187240600586s. -[triton-dejavu] First execution including JIT compilation took 0.6664383411407471s. -[triton-dejavu] First execution including JIT compilation took 0.4817678928375244s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.3559696674346924s. -[triton-dejavu] First execution including JIT compilation took 0.8642761707305908s. -[triton-dejavu] First execution including JIT compilation took 0.5662164688110352s. -[triton-dejavu] First execution including JIT compilation took 0.45751142501831055s. -[triton-dejavu] First execution including JIT compilation took 0.9735383987426758s. -[triton-dejavu] First execution including JIT compilation took 0.6600606441497803s. -[triton-dejavu] First execution including JIT compilation took 0.48941469192504883s. -[triton-dejavu] First execution including JIT compilation took 1.0599989891052246s. -[triton-dejavu] First execution including JIT compilation took 0.5858447551727295s. -[triton-dejavu] First execution including JIT compilation took 0.40030384063720703s. -[triton-dejavu] First execution including JIT compilation took 0.9032082557678223s. -[triton-dejavu] First execution including JIT compilation took 0.5963606834411621s. -[triton-dejavu] First execution including JIT compilation took 0.5698938369750977s. -[triton-dejavu] First execution including JIT compilation took 0.9204597473144531s. -[triton-dejavu] First execution including JIT compilation took 0.7513656616210938s. -[triton-dejavu] First execution including JIT compilation took 0.5392777919769287s. -[triton-dejavu] First execution including JIT compilation took 1.3184521198272705s. -[triton-dejavu] First execution including JIT compilation took 0.7888948917388916s. -[triton-dejavu] First execution including JIT compilation took 0.6177425384521484s. -[triton-dejavu] First execution including JIT compilation took 1.1905827522277832s. -[triton-dejavu] First execution including JIT compilation took 0.7364373207092285s. -[triton-dejavu] First execution including JIT compilation took 0.5242094993591309s. -[triton-dejavu] First execution including JIT compilation took 1.2864527702331543s. -[triton-dejavu] First execution including JIT compilation took 0.8166484832763672s. -[triton-dejavu] First execution including JIT compilation took 0.5594861507415771s. -[triton-dejavu] First execution including JIT compilation took 1.8655834197998047s. -[triton-dejavu] First execution including JIT compilation took 0.9145352840423584s. -[triton-dejavu] First execution including JIT compilation took 0.6113896369934082s. -[triton-dejavu] First execution including JIT compilation took 1.9301745891571045s. -[triton-dejavu] First execution including JIT compilation took 1.0628697872161865s. -[triton-dejavu] First execution including JIT compilation took 0.64133620262146s. -[triton-dejavu] First execution including JIT compilation took 2.2749366760253906s. -[triton-dejavu] First execution including JIT compilation took 1.0524189472198486s. -[triton-dejavu] First execution including JIT compilation took 0.677316427230835s. -bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.6699347496032715s. -[triton-dejavu] First execution including JIT compilation took 0.39471912384033203s. -[triton-dejavu] First execution including JIT compilation took 0.31299400329589844s. -[triton-dejavu] First execution including JIT compilation took 0.9980213642120361s. -[triton-dejavu] First execution including JIT compilation took 0.4564340114593506s. -[triton-dejavu] First execution including JIT compilation took 0.39405226707458496s. 
-[triton-dejavu] First execution including JIT compilation took 0.721914529800415s. -[triton-dejavu] First execution including JIT compilation took 0.5424695014953613s. -[triton-dejavu] First execution including JIT compilation took 0.41809797286987305s. -[triton-dejavu] First execution including JIT compilation took 0.7378096580505371s. -[triton-dejavu] First execution including JIT compilation took 0.538069486618042s. -[triton-dejavu] First execution including JIT compilation took 0.43320608139038086s. -[triton-dejavu] First execution including JIT compilation took 0.8680074214935303s. -[triton-dejavu] First execution including JIT compilation took 0.5815584659576416s. -[triton-dejavu] First execution including JIT compilation took 0.44110822677612305s. -[triton-dejavu] First execution including JIT compilation took 0.797199010848999s. -[triton-dejavu] First execution including JIT compilation took 0.7567603588104248s. -[triton-dejavu] First execution including JIT compilation took 0.47153782844543457s. -[triton-dejavu] First execution including JIT compilation took 0.8809914588928223s. -[triton-dejavu] First execution including JIT compilation took 0.6448085308074951s. -[triton-dejavu] First execution including JIT compilation took 0.5167965888977051s. -[triton-dejavu] First execution including JIT compilation took 0.745863676071167s. -[triton-dejavu] First execution including JIT compilation took 0.5225260257720947s. -[triton-dejavu] First execution including JIT compilation took 0.4189014434814453s. -[triton-dejavu] First execution including JIT compilation took 0.7760834693908691s. -[triton-dejavu] First execution including JIT compilation took 0.5539810657501221s. -[triton-dejavu] First execution including JIT compilation took 0.44478821754455566s. -[triton-dejavu] First execution including JIT compilation took 0.8012809753417969s. -[triton-dejavu] First execution including JIT compilation took 0.6483604907989502s. -[triton-dejavu] First execution including JIT compilation took 0.4678480625152588s. -[triton-dejavu] First execution including JIT compilation took 0.8454635143280029s. -[triton-dejavu] First execution including JIT compilation took 0.6168031692504883s. -[triton-dejavu] First execution including JIT compilation took 0.6612381935119629s. -[triton-dejavu] First execution including JIT compilation took 0.8955931663513184s. -[triton-dejavu] First execution including JIT compilation took 0.6233341693878174s. -[triton-dejavu] First execution including JIT compilation took 0.4918363094329834s. -[triton-dejavu] First execution including JIT compilation took 0.9699711799621582s. -[triton-dejavu] First execution including JIT compilation took 0.6717431545257568s. -[triton-dejavu] First execution including JIT compilation took 0.5321164131164551s. -[triton-dejavu] First execution including JIT compilation took 1.006484031677246s. -[triton-dejavu] First execution including JIT compilation took 0.7167990207672119s. -[triton-dejavu] First execution including JIT compilation took 0.5426886081695557s. -[triton-dejavu] First execution including JIT compilation took 0.8549697399139404s. -[triton-dejavu] First execution including JIT compilation took 0.5004158020019531s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.42880892753601074s. -[triton-dejavu] First execution including JIT compilation took 0.8870432376861572s. -[triton-dejavu] First execution including JIT compilation took 0.5380675792694092s. -[triton-dejavu] First execution including JIT compilation took 0.44513392448425293s. -[triton-dejavu] First execution including JIT compilation took 1.0106170177459717s. -[triton-dejavu] First execution including JIT compilation took 0.6438839435577393s. -[triton-dejavu] First execution including JIT compilation took 0.48810815811157227s. -[triton-dejavu] First execution including JIT compilation took 1.1047391891479492s. -[triton-dejavu] First execution including JIT compilation took 0.6829500198364258s. -[triton-dejavu] First execution including JIT compilation took 0.5343265533447266s. -[triton-dejavu] First execution including JIT compilation took 1.1722900867462158s. -[triton-dejavu] First execution including JIT compilation took 0.7511520385742188s. -[triton-dejavu] First execution including JIT compilation took 0.5391092300415039s. -[triton-dejavu] First execution including JIT compilation took 1.2446460723876953s. -[triton-dejavu] First execution including JIT compilation took 0.7718749046325684s. -[triton-dejavu] First execution including JIT compilation took 0.549095630645752s. -[triton-dejavu] First execution including JIT compilation took 1.3546397686004639s. -[triton-dejavu] First execution including JIT compilation took 0.7892618179321289s. -[triton-dejavu] First execution including JIT compilation took 0.46314549446105957s. -[triton-dejavu] First execution including JIT compilation took 0.9860119819641113s. -[triton-dejavu] First execution including JIT compilation took 0.8724544048309326s. -[triton-dejavu] First execution including JIT compilation took 0.4373140335083008s. -[triton-dejavu] First execution including JIT compilation took 1.0243175029754639s. -[triton-dejavu] First execution including JIT compilation took 0.6186015605926514s. -[triton-dejavu] First execution including JIT compilation took 0.4280831813812256s. -[triton-dejavu] First execution including JIT compilation took 1.5726463794708252s. -[triton-dejavu] First execution including JIT compilation took 0.9008209705352783s. -[triton-dejavu] First execution including JIT compilation took 0.44704723358154297s. -[triton-dejavu] First execution including JIT compilation took 1.6724953651428223s. -[triton-dejavu] First execution including JIT compilation took 0.8446671962738037s. -[triton-dejavu] First execution including JIT compilation took 0.4729273319244385s. -[triton-dejavu] First execution including JIT compilation took 1.7400548458099365s. 
-[triton-dejavu] First execution including JIT compilation took 0.8373644351959229s. -[triton-dejavu] First execution including JIT compilation took 0.5513191223144531s. -[triton-dejavu] First execution including JIT compilation took 1.8062167167663574s. -[triton-dejavu] First execution including JIT compilation took 0.8943934440612793s. -[triton-dejavu] First execution including JIT compilation took 0.5011510848999023s. -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.683664321899414s. -[triton-dejavu] First execution including JIT compilation took 0.8943469524383545s. -[triton-dejavu] First execution including JIT compilation took 0.5135440826416016s. -[triton-dejavu] First execution including JIT compilation took 1.9431862831115723s. -[triton-dejavu] First execution including JIT compilation took 1.2609891891479492s. -[triton-dejavu] First execution including JIT compilation took 0.8276565074920654s. -[triton-dejavu] First execution including JIT compilation took 6.717592477798462s. -[triton-dejavu] First execution including JIT compilation took 1.4402704238891602s. -[triton-dejavu] First execution including JIT compilation took 0.6466906070709229s. -bench_cudagraph failed with out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
[... further deleted log lines condensed: the same "bench_cudagraph failed" message and an identical OutOfResources traceback repeat for shared-memory requirements of 263680, 303104, 307200, 377856, 456704, 527360, 606208, 755712, and 1054720 bytes, each against the same 232448-byte hardware limit, interleaved with "[triton-dejavu] First execution including JIT compilation took ...s." timing lines ranging from roughly 0.38 s to 6.9 s ...]
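Editorial note: the OutOfResources failures above are the expected outcome when the autotuner probes kernel configurations whose shared-memory footprint exceeds the device limit. A minimal sketch of how such configurations can be caught and skipped during a tuning sweep, assuming a hypothetical bench_config() helper that compiles and times one Triton configuration (illustrative only, not the triton-dejavu implementation):

    from triton.runtime.errors import OutOfResources

    def bench_all(configs, bench_config):
        # Benchmark every candidate config; skip the ones that cannot fit in
        # shared memory instead of aborting the whole search.
        timings = {}
        for cfg in configs:
            try:
                timings[cfg] = bench_config(cfg)  # may JIT-compile and launch the kernel
            except OutOfResources as exc:
                # exc carries the "Required: ... Hardware limit: ..." information seen above
                print(f"skipping {cfg}: {exc}")
                timings[cfg] = float("inf")  # mark as unusable, keep tuning
        return timings

The log suggests triton-dejavu already behaves this way ("bench_cudagraph failed with ..." followed by the next trial), so the sweep continues past these configurations.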
-[triton-dejavu] First execution including JIT compilation took 0.4465477466583252s.
-bench_cudagraph failed with CUDA error: out of memory
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1
-Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
-
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph
-    with torch.cuda.graph(g):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__
-    self.cuda_graph.capture_end()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end
-    super().capture_end()
-RuntimeError: CUDA error: out of memory
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1
-Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
[... further deleted log lines condensed: the same "bench_cudagraph failed with CUDA error: out of memory" message and an identical graph-capture traceback repeat many more times, interleaved with "[triton-dejavu] First execution including JIT compilation took ...s." timing lines between roughly 0.36 s and 0.74 s ...]
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5746960639953613s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.397491455078125s. -[triton-dejavu] First execution including JIT compilation took 0.5818440914154053s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6529510021209717s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4184551239013672s. -[triton-dejavu] First execution including JIT compilation took 0.6207692623138428s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4892761707305908s. -[triton-dejavu] First execution including JIT compilation took 0.44497179985046387s. -[triton-dejavu] First execution including JIT compilation took 0.635669469833374s. -[triton-dejavu] First execution including JIT compilation took 0.5256602764129639s. -[triton-dejavu] First execution including JIT compilation took 0.48749327659606934s. -[triton-dejavu] First execution including JIT compilation took 0.5720036029815674s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
-[triton-dejavu] First execution including JIT compilation took 0.42984604835510254s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.38751769065856934s.
-[triton-dejavu] First execution including JIT compilation took 0.6071627140045166s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.4755427837371826s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.3990035057067871s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.61566162109375s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.4701671600341797s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.4040553569793701s.
-[triton-dejavu] First execution including JIT compilation took 0.6511368751525879s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.5009520053863525s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.4353518486022949s.
-[triton-dejavu] First execution including JIT compilation took 0.6790196895599365s.
-[triton-dejavu] First execution including JIT compilation took 0.5248758792877197s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.4504244327545166s.
-[triton-dejavu] First execution including JIT compilation took 0.7124292850494385s.
-[triton-dejavu] First execution including JIT compilation took 0.8151717185974121s.
-[triton-dejavu] First execution including JIT compilation took 0.4823343753814697s.
-[triton-dejavu] First execution including JIT compilation took 0.772719144821167s.
-[triton-dejavu] First execution including JIT compilation took 0.6169350147247314s.
-[triton-dejavu] First execution including JIT compilation took 0.5196678638458252s.
-[triton-dejavu] First execution including JIT compilation took 0.7186233997344971s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.4982566833496094s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.4104623794555664s.
-[triton-dejavu] First execution including JIT compilation took 1.1141915321350098s.
-[triton-dejavu] First execution including JIT compilation took 0.5356433391571045s.
-[triton-dejavu] First execution including JIT compilation took 0.4406569004058838s.
-[triton-dejavu] First execution including JIT compilation took 0.8371496200561523s.
-[triton-dejavu] First execution including JIT compilation took 0.5642838478088379s.
-[triton-dejavu] First execution including JIT compilation took 0.4726717472076416s.
-[triton-dejavu] First execution including JIT compilation took 0.93656325340271s.
-[triton-dejavu] First execution including JIT compilation took 0.6194779872894287s.
-[triton-dejavu] First execution including JIT compilation took 0.4953165054321289s.
-[triton-dejavu] First execution including JIT compilation took 0.9690747261047363s.
-[triton-dejavu] First execution including JIT compilation took 0.6501588821411133s.
-[triton-dejavu] First execution including JIT compilation took 0.5288493633270264s.
-[triton-dejavu] First execution including JIT compilation took 1.2569010257720947s.
-[triton-dejavu] First execution including JIT compilation took 0.6968162059783936s.
-[triton-dejavu] First execution including JIT compilation took 0.5399911403656006s.
-[triton-dejavu] First execution including JIT compilation took 1.2143075466156006s.
-[triton-dejavu] First execution including JIT compilation took 0.733314037322998s.
-[triton-dejavu] First execution including JIT compilation took 0.6001999378204346s.
-[triton-dejavu] First execution including JIT compilation took 1.1585540771484375s.
-[triton-dejavu] First execution including JIT compilation took 0.7061564922332764s.
-[triton-dejavu] First execution including JIT compilation took 0.509422779083252s.
-[triton-dejavu] First execution including JIT compilation took 1.1820228099822998s.
-[triton-dejavu] First execution including JIT compilation took 0.7445404529571533s.
-[triton-dejavu] First execution including JIT compilation took 0.4977116584777832s.
-[triton-dejavu] First execution including JIT compilation took 1.091106653213501s.
-[triton-dejavu] First execution including JIT compilation took 0.7330291271209717s.
-[triton-dejavu] First execution including JIT compilation took 0.5066168308258057s.
-[triton-dejavu] First execution including JIT compilation took 1.6019270420074463s.
-[triton-dejavu] First execution including JIT compilation took 0.9132928848266602s.
-[triton-dejavu] First execution including JIT compilation took 0.6490397453308105s.
-[triton-dejavu] First execution including JIT compilation took 1.566523790359497s.
-[triton-dejavu] First execution including JIT compilation took 0.9093658924102783s.
-[triton-dejavu] First execution including JIT compilation took 0.5752038955688477s.
-[triton-dejavu] First execution including JIT compilation took 1.3499906063079834s.
-[triton-dejavu] First execution including JIT compilation took 0.7472167015075684s.
-[triton-dejavu] First execution including JIT compilation took 0.5233032703399658s.
-bench_cudagraph failed with out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 0.4235203266143799s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.367872953414917s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.3045461177825928s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.4933462142944336s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.5991702079772949s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.3020298480987549s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.43563389778137207s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.3850095272064209s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.40093398094177246s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.6140017509460449s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.48171281814575195s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.5723874568939209s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.5938704013824463s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.7248561382293701s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.5335805416107178s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.632122278213501s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.5055139064788818s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.45116615295410156s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.6221024990081787s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.5203642845153809s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.4454641342163086s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.5253396034240723s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.413818359375s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.37888503074645996s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.5810997486114502s.
-bench_cudagraph failed with CUDA error: out of memory
-[triton-dejavu] First execution including JIT compilation took 0.6574397087097168s.
-bench_cudagraph failed with CUDA error: out of memory
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.39346885681152344s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.62447190284729s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.478407621383667s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4812755584716797s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6491189002990723s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5157642364501953s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.42938828468322754s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.680762529373169s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5025956630706787s. 
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.43789172172546387s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.662214994430542s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- - -[triton-dejavu] First execution including JIT compilation took 0.5385723114013672s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.42354893684387207s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7007582187652588s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5846168994903564s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.46515345573425293s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.9489858150482178s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4519827365875244s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.3820490837097168s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6905148029327393s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4977149963378906s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.41359424591064453s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7216389179229736s. 
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5443611145019531s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.44051408767700195s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- - -[triton-dejavu] First execution including JIT compilation took 0.7794044017791748s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.562237024307251s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5855629444122314s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.8041880130767822s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5811121463775635s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4584963321685791s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.0672459602355957s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.586475133895874s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.49823427200317383s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.8634672164916992s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6550483703613281s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5157911777496338s. 
-[triton-dejavu tuning log, deleted by this patch. For a large number of candidate configurations the same two failure modes repeat, each preceded by a "[triton-dejavu] First execution including JIT compilation took ...s." timing line:]
-
-[1. "bench_cudagraph failed with CUDA error: out of memory" — a RuntimeError raised from torch/cuda/graphs.py capture_end() while triton_dejavu/testing.py:_do_bench_cudagraph captures the kernel under `with torch.cuda.graph(g):`, with the usual hints to pass CUDA_LAUNCH_BLOCKING=1 and compile with `TORCH_USE_CUDA_DSA`.]
-
-[2. "bench_cudagraph failed with out of resource: shared memory, Required: 272384" (or "374784"), "Hardware limit: 232448. Reducing block sizes or `num_stages` may help." — a triton.runtime.errors.OutOfResources raised from triton/compiler/compiler.py:_init_handles when triton_dejavu/autotuner.py kernel_call launches the kernel.]
- - -[triton-dejavu] First execution including JIT compilation took 0.5344371795654297s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.3720393180847168s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6856989860534668s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.49400806427001953s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.40635251998901367s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6419022083282471s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.42707204818725586s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.37799835205078125s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7089602947235107s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6736738681793213s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.3756542205810547s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.805124044418335s. 
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5075352191925049s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.3723928928375244s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- - -[triton-dejavu] First execution including JIT compilation took 0.7105093002319336s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5702188014984131s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.374800443649292s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.8257863521575928s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.562946081161499s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.42319226264953613s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7553699016571045s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5543286800384521s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4214789867401123s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7838122844696045s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5473670959472656s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.45372581481933594s. 
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7879917621612549s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4837973117828369s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- - -[triton-dejavu] First execution including JIT compilation took 0.39473915100097656s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.8744144439697266s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6230897903442383s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5225625038146973s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.4133057594299316s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.0481688976287842s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.756004810333252s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.3453631401062012s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6478581428527832s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.45032429695129395s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.1815299987792969s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6545298099517822s. 
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7831099033355713s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.3497538566589355s. -[triton-dejavu] First execution including JIT compilation took 0.6530613899230957s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
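For context, every failure above surfaces inside triton-dejavu's `_do_bench_cudagraph`, which times each candidate kernel by replaying it from a captured CUDA graph; the `RuntimeError` is raised when `capture_end()` cannot allocate the memory pool backing the graph. Below is a minimal, hypothetical sketch of that benchmarking pattern, not the repository's code; the helper name, iteration counts, and error handling are illustrative only. The remaining failures in this log are a different, compile-time limit: the selected tile sizes exceed the GPU's shared memory.

    import os
    import torch

    # Surface asynchronous CUDA errors at the failing call, as the log suggests.
    os.environ.setdefault("CUDA_LAUNCH_BLOCKING", "1")

    def bench_under_cudagraph(fn, warmup_iters=3, timed_iters=10):
        """Time `fn` by replaying it from a captured CUDA graph (illustrative sketch)."""
        side_stream = torch.cuda.Stream()
        side_stream.wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(side_stream):
            for _ in range(warmup_iters):
                fn()  # warm up / JIT-compile outside of capture
        torch.cuda.current_stream().wait_stream(side_stream)

        g = torch.cuda.CUDAGraph()
        try:
            # capture_end() runs when this context exits; that is where the
            # "CUDA error: out of memory" in the log above is raised.
            with torch.cuda.graph(g):
                fn()
        except RuntimeError as e:
            print(f"bench_cudagraph failed with {e}")
            return float("inf")

        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        for _ in range(timed_iters):
            g.replay()
        end.record()
        torch.cuda.synchronize()
        return start.elapsed_time(end) / timed_iters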
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.46605396270751953s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -bench_cudagraph failed with out of resource: shared memory, Required: 244736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 244736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 244736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 244736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 244736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 244736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.367077350616455s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7068085670471191s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.44535279273986816s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.4023311138153076s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7958266735076904s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.507124662399292s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 2.0259878635406494s. -[triton-dejavu] First execution including JIT compilation took 0.9261250495910645s. 
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5468614101409912s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 2.3781609535217285s. -[triton-dejavu] First execution including JIT compilation took 1.0091207027435303s. -[triton-dejavu] First execution including JIT compilation took 0.6951940059661865s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 489472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 489472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 489472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 489472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 489472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 489472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.7430386543273926s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5558526515960693s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4275531768798828s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.8700566291809082s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5754470825195312s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.448347806930542s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.90216064453125s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6065723896026611s. 
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.47638845443725586s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.9279463291168213s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- - -[triton-dejavu] First execution including JIT compilation took 0.7985544204711914s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4977383613586426s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.9453699588775635s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4945030212402344s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.3977932929992676s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7945261001586914s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.549720287322998s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.42061400413513184s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.9728484153747559s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5585596561431885s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4591073989868164s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7104089260101318s. 
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7166852951049805s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -bench_cudagraph failed with out of resource: shared memory, Required: 259072, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 259072, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 259072, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 259072, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 259072, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 259072, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 359424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 359424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 359424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 359424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 359424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 359424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 2.2592248916625977s. -[triton-dejavu] First execution including JIT compilation took 1.0523405075073242s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.599764347076416s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 2.521293878555298s. -[triton-dejavu] First execution including JIT compilation took 1.171839952468872s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6534533500671387s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 6.292866468429565s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.8475072383880615s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7422606945037842s. 
-bench_cudagraph failed with CUDA error: out of memory
-
-bench_cudagraph failed with out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 518144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 518144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 518144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 718848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 718848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 718848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 1.093348503112793s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 0.6464064121246338s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 0.4449312686920166s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 1.3426687717437744s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 0.91444993019104s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 0.5213837623596191s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 1.3889944553375244s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 0.7329103946685791s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 0.6597933769226074s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 1.227839469909668s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 0.7042691707611084s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 0.6446876525878906s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 1.8344454765319824s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 0.9738888740539551s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 0.672421932220459s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 1.6915192604064941s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 1.001119613647461s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 0.7162504196166992s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 1.8200452327728271s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 1.0510845184326172s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 0.7335896492004395s.
-bench_cudagraph failed with CUDA error: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 1.6177794933319092s.
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.9454030990600586s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5991966724395752s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- - -[triton-dejavu] First execution including JIT compilation took 1.6675848960876465s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.9363722801208496s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6759653091430664s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.862114667892456s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.0182960033416748s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6502413749694824s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.9105088710784912s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.9901387691497803s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6667122840881348s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.7807495594024658s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.8490705490112305s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5338022708892822s. 
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.4980332851409912s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.87496018409729s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- - -[triton-dejavu] First execution including JIT compilation took 0.5721733570098877s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.6702356338500977s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.9042339324951172s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6040554046630859s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.504411220550537s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7958929538726807s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5112464427947998s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.6463310718536377s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.9492459297180176s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5592634677886963s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 2.21022367477417s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.9613430500030518s. 
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5633087158203125s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 2.2821779251098633s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- - -[triton-dejavu] First execution including JIT compilation took 1.097722053527832s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6317684650421143s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 3.0794928073883057s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.2995553016662598s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.8081183433532715s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 3.323143243789673s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.379629373550415s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7605845928192139s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -bench_cudagraph failed with out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 2.796154260635376s. -[triton-dejavu] First execution including JIT compilation took 1.4310317039489746s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7960169315338135s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 3.4028375148773193s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.6688313484191895s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.9481635093688965s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 8.42493486404419s. -[triton-dejavu] First execution including JIT compilation took 1.7116987705230713s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7699902057647705s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -bench_cudagraph failed with out of resource: shared memory, Required: 257024, Hardware limit: 232448. 
Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 257024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 257024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 257024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 257024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 257024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 422912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 422912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 422912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 422912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 422912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 422912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 588800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 588800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 588800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 588800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 588800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 588800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 6.023736238479614s. -[triton-dejavu] First execution including JIT compilation took 2.0444483757019043s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.062030553817749s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 6.113872051239014s. -[triton-dejavu] First execution including JIT compilation took 2.0313453674316406s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.0472145080566406s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -bench_cudagraph failed with out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 514048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 514048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 514048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 514048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 514048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 514048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 845824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 845824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 845824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 845824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 845824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 845824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1177600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1177600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1177600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1177600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1177600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1177600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.48328304290771484s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
-
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph
-    with torch.cuda.graph(g):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__
-    self.cuda_graph.capture_end()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end
-    super().capture_end()
-RuntimeError: CUDA error: out of memory
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1
-Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
- - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -[triton-dejavu] First execution including JIT compilation took 0.5852396488189697s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -[triton-dejavu] First execution including JIT compilation took 0.8381209373474121s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5893561840057373s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7483389377593994s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5384445190429688s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.658228874206543s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.886188268661499s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4752342700958252s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7237246036529541s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9241292476654053s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5497918128967285s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4125850200653076s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3035178184509277s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6299910545349121s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4645383358001709s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.442227840423584s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6872811317443848s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5252459049224854s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.6195275783538818s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9957168102264404s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5770971775054932s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.6193320751190186s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.776587724685669s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5602409839630127s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.470637559890747s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6102380752563477s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.44856834411621094s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.342865228652954s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7176856994628906s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
-
-bench_cudagraph failed with out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 285696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0426855087280273s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5906248092651367s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.9886162281036377s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2194347381591797s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6244046688079834s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 411648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 411648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 563200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 563200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 571392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 571392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 571392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 571392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.48250651359558105s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.31485867500305176s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3440537452697754s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8460357189178467s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.36809873580932617s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3085494041442871s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5444772243499756s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.38303327560424805s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.34803223609924316s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5283372402191162s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3868238925933838s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Traceback (most recent call last):
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

[triton-dejavu] First execution including JIT compilation took 0.35518574714660645s.
bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated.

[... the same OutOfMemoryError traceback and "bench_cudagraph failed with CUDA out of memory" message repeat for each subsequent autotuning configuration; only the reported JIT compilation time varies, roughly 0.3 s to 0.9 s per attempt ...]
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.45058488845825195s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.86492919921875s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4958674907684326s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4237086772918701s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8849928379058838s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.486788272857666s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4373905658721924s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0987954139709473s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8294305801391602s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4790806770324707s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1772897243499756s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.521981954574585s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.40645861625671387s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.16839599609375s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5616059303283691s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.42472362518310547s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3248302936553955s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6032726764678955s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4481019973754883s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.395122766494751s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7378954887390137s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5323140621185303s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.7057254314422607s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6767878532409668s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5054512023925781s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.7137601375579834s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
-bench_cudagraph failed with out of resource: shared memory, Required: 234496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 234496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 310272, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 318464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 0.5937278270721436s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated.
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.922431230545044s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2858715057373047s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5902688503265381s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.350640296936035s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.5020318031311035s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6842968463897705s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 385024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 385024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 385024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 385024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 452608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 452608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 468992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 468992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 468992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 468992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 620544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 620544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 636928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 636928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 636928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 636928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.7688605785369873s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3665287494659424s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3449244499206543s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6777553558349609s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4027137756347656s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.38666725158691406s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7940988540649414s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4141719341278076s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.37494373321533203s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7313904762268066s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5596542358398438s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4497413635253906s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2131459712982178s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5678653717041016s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4486379623413086s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3713254928588867s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5958590507507324s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5505542755126953s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.4308266639709473s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7147002220153809s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5467684268951416s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3634920120239258s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5880486965179443s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5700411796569824s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.4810731410980225s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6438288688659668s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4656994342803955s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.6747398376464844s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7002549171447754s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5245516300201416s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.000950813293457s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9302749633789062s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.604921817779541s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.449615478515625s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
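The repeated failure above comes from the 256 MB int8 L2-flush buffer allocated in _do_bench_cudagraph while the GPU is already nearly full, and the allocator hint printed in the message (PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True) only takes effect if it is set before the first CUDA allocation. A minimal sketch of how such an allocation could be guarded in a standalone script; alloc_flush_cache and the quarter-size fallback are illustrative assumptions, not code from triton_dejavu/testing.py:

```python
import os

# The hint must be in the environment before the first CUDA allocation to take effect.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch


def alloc_flush_cache(nbytes: int = int(256e6)) -> torch.Tensor:
    """Allocate the int8 buffer used to flush the L2 cache between timed runs,
    falling back to a smaller buffer if the GPU is already nearly full."""
    try:
        return torch.empty(nbytes, dtype=torch.int8, device="cuda")
    except torch.OutOfMemoryError:
        torch.cuda.empty_cache()  # release cached allocator blocks before retrying
        return torch.empty(nbytes // 4, dtype=torch.int8, device="cuda")
```

A smaller flush buffer makes the timing slightly less isolated from cache effects, so this is a trade-off rather than a fix; the underlying issue is that the benchmarked engine already holds ~78 GiB of the 79.18 GiB device.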
bench_cudagraph failed with out of resource: shared memory, Required: 233472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
Traceback (most recent call last):
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
    fn()
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
    return jit_first_time()
           ^^^^^^^^^^^^^^^^
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
    ret = self.call_lambda()
          ^^^^^^^^^^^^^^^^^^
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
    self.fn.run(
  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
    ^^^^^^^^^^
  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
    self._init_handles()
  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 233472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.

[The same OutOfResources failure and traceback repeat for configurations requiring 267264, 283648, 367616, and 384000 bytes of shared memory against the same 232448-byte hardware limit.]
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 384000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 3.207704782485962s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2239928245544434s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6375505924224854s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.287391424179077s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2227861881256104s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6965057849884033s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 233472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 233472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 266240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 266240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 266240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 266240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 434176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 434176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 466944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 466944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 466944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 466944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 534528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 534528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 567296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 567296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 567296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 567296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 735232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 735232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 768000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 768000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 768000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 768000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.9831523895263672s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5336413383483887s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.46745753288269043s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0242087841033936s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6498258113861084s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6161227226257324s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0387804508209229s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5843358039855957s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.489365816116333s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[triton-dejavu] First execution including JIT compilation took 1.084639549255371s.
[... the same traceback, torch.OutOfMemoryError, "[triton-dejavu] First execution including JIT compilation took ...s." and "bench_cudagraph failed with CUDA out of memory" sequence repeats for every remaining autotuning configuration in this log; only the reported JIT-compilation times (roughly 0.5 s to 1.6 s) vary, and the reported free memory later drops from 238.94 MiB to 236.94 MiB ...]
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated.
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.6750471591949463s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.003296375274658203s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6038086414337158s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.8977270126342773s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9200453758239746s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7148220539093018s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.773270845413208s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8617911338806152s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6090264320373535s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 241152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 241152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 257536, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 257536, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 257536, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 257536, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 2.089289903640747s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8955142498016357s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7933282852172852s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.6730310916900635s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.027360200881958s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6189002990722656s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.377192258834839s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0640830993652344s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6758365631103516s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.402773380279541s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 282624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 282624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 315392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 315392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 315392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 4.78285551071167s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3876776695251465s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8190820217132568s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.0606014728546143s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3506171703338623s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8591070175170898s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 232.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.745933771133423s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3523740768432617s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8213198184967041s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.9839930534362793s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.496906042098999s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8184218406677246s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.0979418754577637s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.5288279056549072s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8494882583618164s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.252285957336426s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.004141569137573242s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8563632965087891s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.2991631031036377s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.5022201538085938s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8538022041320801s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.498495578765869s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.5448570251464844s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8523283004760742s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.52825927734375s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.6031606197357178s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0104546546936035s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[triton-dejavu] First execution including JIT compilation took 3.137936592102051s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-bench_cudagraph failed with out of resource: shared memory, Required: 239616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 239616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 5.693915367126465s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.4908199310302734s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 10.97849154472351s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 430080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 430080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 561152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 561152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 561152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 561152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 628736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 628736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 759808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 759808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 759808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 759808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 958464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 958464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 958464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 958464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1026048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1026048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1157120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1157120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1157120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1157120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1423360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1423360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1554432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1554432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1554432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1554432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.6892292499542236s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.38911986351013184s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.30687904357910156s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6879351139068604s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.42769932746887207s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.329437255859375s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7260704040527344s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4387474060058594s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.39746975898742676s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
[triton-dejavu autotuning log, condensed: every benchmarked configuration fails with the same CUDA out-of-memory error; only the measured JIT compilation time differs between attempts.]

Traceback (most recent call last):
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
[triton-dejavu] First execution including JIT compilation took 0.7281160354614258s.
bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. (remaining diagnostics identical to the traceback above; this "First execution ... / bench_cudagraph failed ..." pair repeats for every subsequent configuration, with JIT compilation times ranging from roughly 0.34 s to 1.43 s.)
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3885080814361572s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6621842384338379s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4451918601989746s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3652503490447998s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7669777870178223s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4610159397125244s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.5304932594299316s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7010109424591064s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
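Note on the recurring OOM above: the allocator hint printed in the error message can be applied before the tuning run, and the free-memory situation can be checked before the benchmark allocates its 256 MB cache buffer. The following is a minimal sketch only, not part of triton-dejavu; the helper name and the early empty_cache() call are illustrative assumptions:

    import os
    # expandable_segments must be set before the CUDA caching allocator
    # is initialized, i.e. before the first CUDA allocation.
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

    import torch

    def cache_buffer_fits(cache_bytes: int = int(256e6)) -> bool:
        """Report whether the benchmark's cache buffer is likely to fit on the GPU."""
        free_bytes, _total_bytes = torch.cuda.mem_get_info()
        return free_bytes > cache_bytes

    if not cache_buffer_fits():
        # Return cached but unused allocator blocks to the driver first.
        torch.cuda.empty_cache()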
Further attempts fail the same way with 72.94 MiB free (JIT compilation times between roughly 0.45 s and 3.35 s). In addition, the larger candidate configurations now fail when the compiled kernel is loaded, because they require more shared memory per block than the hardware provides:

bench_cudagraph failed with out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
Traceback (most recent call last):
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
    fn()
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
    return jit_first_time()
           ^^^^^^^^^^^^^^^^
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
    ret = self.call_lambda()
          ^^^^^^^^^^^^^^^^^^
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
    self.fn.run(
  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
    ^^^^^^^^^^
  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
    self._init_handles()
  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.

(Equivalent OutOfResources failures follow for configurations requiring 265728 bytes, interleaved with further 246 MiB CUDA OOM failures.)
(The same OutOfResources traceback repeats as the autotuner tries progressively larger configurations; the required shared memory grows while the hardware limit stays at 232448 bytes:)

bench_cudagraph failed with out of resource: shared memory, Required: 244736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
bench_cudagraph failed with out of resource: shared memory, Required: 315392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
bench_cudagraph failed with out of resource: shared memory, Required: 319488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
bench_cudagraph failed with out of resource: shared memory, Required: 386048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
bench_cudagraph failed with out of resource: shared memory, Required: 390144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
bench_cudagraph failed with out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
bench_cudagraph failed with out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.

[triton-dejavu] First execution including JIT compilation took 5.510880470275879s.
bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. (Same OutOfMemoryError and allocator suggestion as above.)
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.307586193084717s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8460187911987305s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 4.805698871612549s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.5006825923919678s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8813536167144775s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 356352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 356352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 356352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 356352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 489472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 489472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 630784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 630784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 638976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 638976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 638976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 638976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 772096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 772096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 780288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 780288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 780288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 780288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1054720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1054720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1062912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1062912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1062912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1062912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.8435537815093994s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4684276580810547s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3505737781524658s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8157010078430176s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5323681831359863s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.37955546379089355s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0491282939910889s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6324632167816162s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3687443733215332s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9693076610565186s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[triton-dejavu] First execution including JIT compilation took 0.5698964595794678s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[... the identical Traceback and "bench_cudagraph failed with CUDA out of memory" message repeat for each subsequent autotuning attempt; only the reported JIT compilation time changes: 0.3987698554992676s, 0.994401216506958s, 0.5128960609436035s, 0.39620423316955566s, 0.8957595825195312s, 0.558398962020874s, 0.38295984268188477s, 0.9372365474700928s, 0.5337975025177002s, 0.4269568920135498s, 0.9156548976898193s, 0.49073123931884766s, 0.3529493808746338s, 0.9221329689025879s, 0.5333900451660156s, 0.38635730743408203s, 1.1536855697631836s, 0.5275108814239502s, 0.41078877449035645s, 1.0259864330291748s, 0.0027832984924316406s, 0.42501282691955566s, 1.0600135326385498s, 0.6524257659912109s, 0.40082621574401855s, 1.0696709156036377s, 0.6272509098052979s, 0.4128298759460449s, 1.1351423263549805s, 0.889866828918457s, 0.4912388324737549s, 1.3318710327148438s, 0.5907411575317383s, 0.41211819648742676s, 1.3889124393463135s, 0.8929169178009033s, 0.46093177795410156s, 1.5692577362060547s, 0.7735788822174072s, 0.47498464584350586s ...]
-[triton-dejavu] First execution including JIT compilation took 1.6411559581756592s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use.
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7738308906555176s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6725783348083496s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.7961504459381104s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9248149394989014s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5011544227600098s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.7031898498535156s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0630671977996826s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5364012718200684s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
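All of the OutOfMemoryError entries above originate from the same allocation: the cudagraph benchmarking helper tries to create a ~256 MB int8 buffer to flush the L2 cache while vLLM has already reserved almost the entire 80 GB device. Below is a minimal, hypothetical sketch of the two mitigations the log itself suggests; the helper name, the fallback sizes, and the retry policy are illustrative assumptions, not the actual triton-dejavu implementation.

    import os
    import torch

    # Assumption: the allocator config must be set before the first CUDA
    # allocation in the process for expandable_segments to take effect.
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

    def alloc_l2_flush_buffer(preferred_bytes: int = int(256e6)) -> torch.Tensor:
        """Hypothetical helper: allocate the L2-cache flush buffer used while
        benchmarking CUDA graphs, halving the request instead of aborting the
        whole autotuning run when the device is nearly full."""
        size = preferred_bytes
        while size >= int(16e6):  # assumed lower bound for a useful flush buffer
            try:
                return torch.empty(size, dtype=torch.int8, device="cuda")
            except torch.OutOfMemoryError:
                torch.cuda.empty_cache()  # release cached blocks, retry smaller
                size //= 2
        raise RuntimeError("could not allocate an L2 flush buffer for benchmarking")
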
-bench_cudagraph failed with out of resource: shared memory, Required: 278016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 278016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 282112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 257024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 406528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 556032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 564224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 68.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1015229225158691s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 5.3806397914886475s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.614715576171875s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.013362169265747s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 364544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 364544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 380928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 380928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 380928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 380928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 514048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 514048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 663552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 663552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 813056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 813056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 829440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 829440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 829440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 829440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1112064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1112064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1128448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1128448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1128448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1128448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.0904018878936768s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
-
-[triton-dejavu] First execution including JIT compilation took 1.2691125869750977s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated.
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7364680767059326s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.45516157150268555s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.197962999343872s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6473965644836426s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.38663530349731445s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.4497623443603516s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6786634922027588s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.45200419425964355s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3861651420593262s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7120561599731445s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.43462252616882324s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.4068715572357178s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.0032808780670166016s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.46666932106018066s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.662893533706665s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7151412963867188s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.47615790367126465s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.4645192623138428s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7670283317565918s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5515899658203125s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.5568029880523682s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7833657264709473s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6841933727264404s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.7808070182800293s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8353831768035889s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5087378025054932s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[triton-dejavu] First execution including JIT compilation took 1.8177378177642822s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
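Every CUDA-OOM failure in this log originates from the same place: `_do_bench_cudagraph` (testing.py:351) allocates a ~256 MB int8 buffer to flush the L2 cache before replaying the captured graph, and once the tuning run has exhausted the device that allocation is what raises. The log's own hint, PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True, only mitigates fragmentation, not a genuinely full GPU. Below is a minimal sketch of how a benchmarking harness can treat an OOM'd configuration as unusable instead of aborting the whole sweep; the helper name and the `inf` sentinel are illustrative assumptions, not the triton-dejavu implementation.

import math
import torch

def bench_with_oom_guard(fn, cache_bytes: int = int(256e6)) -> float:
    """Hypothetical helper: run fn() once behind an L2-flush buffer; report inf on CUDA OOM."""
    try:
        # Same pattern as testing.py:351 above: a throwaway int8 buffer used to
        # flush the L2 cache between timed replays.
        cache = torch.empty(cache_bytes, dtype=torch.int8, device="cuda")
        cache.zero_()
        fn()
        return 0.0  # a real harness would return a measured latency here
    except torch.cuda.OutOfMemoryError as err:
        print(f"bench_cudagraph failed with {err}")
        torch.cuda.empty_cache()  # release cached blocks before trying the next config
        return math.inf           # mark this configuration as unusable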
-
-bench_cudagraph failed with out of resource: shared memory, Required: 306688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 306688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 3.5550520420074463s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
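The OutOfResources failures above are a different failure mode from the OOMs: `triton.runtime.errors.OutOfResources` is raised when the compiled kernel is loaded (`_init_handles`) because the candidate configuration needs more shared memory than the device offers (232448 bytes here), so such configs can never run on this GPU no matter how much free memory there is. A short sketch of how an autotuning loop can simply discard these candidates is shown below; `launch` is assumed to be a caller-supplied callable that compiles and runs one configuration, and the helper itself is hypothetical, not the triton-dejavu autotuner.

import math
from triton.runtime.errors import OutOfResources

def try_config(launch) -> float:
    """Hypothetical helper: benchmark one candidate config, discarding it if it cannot be loaded."""
    try:
        launch()      # compiling/loading raises OutOfResources if shared memory is exceeded
        return 0.0    # a real autotuner would return the measured runtime here
    except OutOfResources as err:
        # e.g. "out of resource: shared memory, Required: 306688, Hardware limit: 232448. ..."
        print(f"bench_cudagraph failed with {err}")
        return math.inf  # prune this configuration from the search space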
-
-bench_cudagraph failed with out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 364544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 380928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 447488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 463872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 613376, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 629760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.6616246700286865s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9932739734649658s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 6.838382720947266s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-bench_cudagraph failed with out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 430080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 563200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 729088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 761856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 894976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 927744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1226752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1259520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 2.333728551864624s.
-[triton-dejavu] First execution including JIT compilation took 0.9741692543029785s.
-[triton-dejavu] First execution including JIT compilation took 0.5558607578277588s.
-[triton-dejavu] First execution including JIT compilation took 2.7010092735290527s.
-[triton-dejavu] First execution including JIT compilation took 1.0556650161743164s.
-[triton-dejavu] First execution including JIT compilation took 0.5470688343048096s.
-[triton-dejavu] First execution including JIT compilation took 2.450766086578369s.
-[triton-dejavu] First execution including JIT compilation took 1.0898175239562988s.
-[triton-dejavu] First execution including JIT compilation took 0.8188917636871338s.
-[triton-dejavu] First execution including JIT compilation took 2.6552906036376953s.
-[triton-dejavu] First execution including JIT compilation took 1.0663738250732422s.
-[triton-dejavu] First execution including JIT compilation took 0.6016709804534912s.
-[triton-dejavu] First execution including JIT compilation took 2.5987627506256104s.
-[triton-dejavu] First execution including JIT compilation took 1.0263035297393799s.
-[triton-dejavu] First execution including JIT compilation took 0.6767642498016357s.
-[triton-dejavu] First execution including JIT compilation took 2.7210144996643066s.
-[triton-dejavu] First execution including JIT compilation took 1.0204486846923828s.
-[triton-dejavu] First execution including JIT compilation took 0.614182710647583s.
-[triton-dejavu] First execution including JIT compilation took 2.7602884769439697s.
-[triton-dejavu] First execution including JIT compilation took 1.0571949481964111s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use.
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6267237663269043s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.647933006286621s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9846975803375244s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5680670738220215s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.754499912261963s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0354676246643066s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.588914155960083s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.931586742401123s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1365456581115723s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6373190879821777s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.8904852867126465s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.14112305641174316s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5947198867797852s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.019486665725708s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1140179634094238s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6876914501190186s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.175428867340088s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.112687349319458s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8204576969146729s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.2454752922058105s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2166831493377686s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6668229103088379s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.3426685333251953s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.6171464920043945s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.639662504196167s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.8239479064941406s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 56.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[triton-dejavu] First execution including JIT compilation took 1.258443832397461s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB.
-[... the same torch.OutOfMemoryError traceback repeats for each subsequent benchmarked configuration; only the JIT compilation times (0.67 s up to 11.4 s) and the reported free memory (56.94 MiB down to 52.94 MiB) change ...]
-
-bench_cudagraph failed with out of resource: shared memory, Required: 264704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[... the same OutOfResources traceback repeats for the remaining configurations, with Required shared memory values from 264192 up to 760832 bytes against the 232448-byte hardware limit, interleaved with further torch.OutOfMemoryError entries of the form shown above ...]
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 462848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 462848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 528384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 528384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 528384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 528384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 727040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 727040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 727040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 727040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 860160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 860160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 925696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 925696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 925696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 925696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1058816, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1058816, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1124352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1124352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1124352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1124352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1456128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1456128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1521664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1521664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1521664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1521664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 5.428146839141846s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.640364408493042s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1616089344024658s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 5.564483642578125s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.6187920570373535s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2367215156555176s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 5.862403154373169s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.5825343132019043s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.245880126953125s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 5.880247354507446s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.4725282192230225s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2873585224151611s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 5.849554061889648s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.6708860397338867s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.281620740890503s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
-bench_cudagraph failed with out of resource: shared memory, Required: 239360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 239360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 255744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 40.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 9.257888555526733s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 40.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 40.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 280576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 280576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 313344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 313344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 313344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 313344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 346624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 346624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 379392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 379392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 379392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 379392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 478720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 478720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 511488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 511488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 511488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 511488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 12.265568017959595s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 40.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 40.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 4.9171974658966064s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 40.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 40.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.9982192516326904s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 40.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 40.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 12.480285167694092s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 38.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 38.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 4.878373861312866s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 38.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 38.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.379201889038086s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 38.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 38.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 429056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 429056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 561152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 561152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 626688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 626688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 626688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 626688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 693248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 693248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 758784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 758784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 758784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 758784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 957440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 957440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1022976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1022976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
[... repeated, near-identical "bench_cudagraph failed with out of resource: shared memory" messages and tracebacks elided: further configurations in this sweep failed with Required sizes of 329728, 460800, 593920, 724992, 858112, 989184, 1122304, 1253376, 1386496, 1517568, 1914880, and 2045952 bytes against the same 232448-byte hardware limit ...]
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__
-    self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
-RuntimeError: Triton Error [CUDA]: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 11.809521436691284s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
[... the same "bench_cudagraph failed with CUDA out of memory" message and torch.OutOfMemoryError traceback repeat for each remaining configuration; only the "[triton-dejavu] First execution including JIT compilation took ...s." timings vary, roughly 0.38s to 4.70s ...]
-
-[triton-dejavu] First execution including JIT compilation took 1.6244652271270752s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use.
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6642742156982422s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.46008729934692383s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.7974128723144531s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7657811641693115s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.482452392578125s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.7750086784362793s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.993119478225708s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.46810221672058105s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.8732283115386963s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8175673484802246s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5403792858123779s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.8839638233184814s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9344329833984375s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4993572235107422s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.9738190174102783s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.921602725982666s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5270700454711914s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 254720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 254720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 255744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 255744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 255744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 255744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 2.9389686584472656s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9776608943939209s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.721153974533081s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.328566074371338s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1687307357788086s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7047884464263916s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
- cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
- ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[triton-dejavu] First execution including JIT compilation took 4.064958572387695s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-bench_cudagraph failed with out of resource: shared memory, Required: 237056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
- fn()
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
- return jit_first_time()
- ^^^^^^^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
- ret = self.call_lambda()
- ^^^^^^^^^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
- self.fn.run(
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
- kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
- ^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
- self._init_handles()
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
- raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 237056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 239104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 305152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 307200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 373248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 375296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 509440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 511488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-bench_cudagraph failed with out of resource: shared memory, Required: 337920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 342016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 474112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 478208, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 610304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 614400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 746496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 750592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1018880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1022976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 403456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 411648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 675840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 684032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 684032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 684032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 684032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 948224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 948224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 956416, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 956416, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 956416, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 956416, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1220608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1220608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1228800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1228800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1228800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1228800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1492992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1492992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1501184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1501184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1501184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1501184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2037760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2037760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2045952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2045952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2045952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2045952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.6654775142669678s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6987285614013672s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.46905040740966797s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.6758484840393066s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8101885318756104s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7105352878570557s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.8995463848114014s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8054959774017334s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.47978901863098145s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.7944765090942383s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8776211738586426s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5182280540466309s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.7438545227050781s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.814023494720459s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5291764736175537s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[triton-dejavu] First execution including JIT compilation took 2.2716925144195557s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 30.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

[... the same CUDA out-of-memory failure of the cache-flush allocation in _do_bench_cudagraph (testing.py, line 351) repeats for each subsequent autotuner configuration; only the reported JIT compilation times (between roughly 0.50 s and 4.64 s) and the free memory on GPU 0 (dropping from 32.94 MiB to 28.94 MiB) change ...]

-bench_cudagraph failed with out of resource: shared memory, Required: 261888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 261888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.

[... analogous shared-memory OutOfResources failures follow for further configurations, with required sizes of 243200, 247296, 263936 and 313344 bytes against the same 232448-byte hardware limit ...]

-bench_cudagraph failed with out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 383488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 383488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 387584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 387584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 387584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 387584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 523776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 523776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 527872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 527872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 6.749169826507568s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.6596052646636963s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9074513912200928s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 8.860004186630249s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.4395840167999268s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0885822772979736s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 346112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 346112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 486400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 486400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 626688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 626688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 634880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 634880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 634880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 634880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 766976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 766976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 775168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 775168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 775168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 775168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1047552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1047552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1055744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1055744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1055744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1055744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 24.700168132781982s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[triton-dejavu] First execution including JIT compilation took 6.437432765960693s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
[... the same torch.OutOfMemoryError traceback and bench_cudagraph failure repeat for the next configurations; only the reported JIT compilation time (2.4199228286743164 s here) changes ...]
-bench_cudagraph failed with out of resource: shared memory, Required: 411648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 411648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
[... an identical triton.runtime.errors.OutOfResources traceback follows each of the subsequent failures; only the required amount of shared memory grows ...]
-bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 692224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 708608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 972800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 989184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1253376, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1269760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1533952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1550336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 2095104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 2111488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
[... after the shared-memory failures the sweep returns to CUDA-graph benchmarking, and every remaining configuration fails with the same torch.OutOfMemoryError while allocating the 256 MB benchmark cache in _do_bench_cudagraph (testing.py, line 351); the JIT compilation times range from 0.59 s to 3.31 s and the free memory reported for GPU 0 shrinks from 26.94 MiB to 24.94 MiB and finally 22.94 MiB ...]
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use.
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.3032917976379395s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.5225858688354492s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6467459201812744s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.437588930130005s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2381985187530518s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6682257652282715s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.7129526138305664s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2800922393798828s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6987648010253906s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.6680898666381836s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3012008666992188s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.779672384262085s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 22.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.7857558727264404s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3570668697357178s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8044769763946533s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 276224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
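The allocator hint in the OOM message only takes effect if it is in the environment before the first CUDA allocation. A minimal sketch, assuming the tuning run is launched as a standalone Python process; the mem_get_info check is purely illustrative and not part of the repo's scripts:

import os

# Assumption: set before torch is imported so the CUDA caching allocator
# picks up the expandable-segments mode suggested by the OOM message.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch

# Rough headroom check before triton-dejavu allocates its 256 MB L2-flush
# buffer (the torch.empty(int(256e6), ...) call in testing.py above).
free_bytes, total_bytes = torch.cuda.mem_get_info()
print(f"free: {free_bytes / 2**20:.0f} MiB of {total_bytes / 2**30:.2f} GiB total")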
-bench_cudagraph failed with out of resource: shared memory, Required: 276224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 276224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
[... the same OutOfResources traceback repeats, interleaved with further OutOfMemoryError failures of the form shown above, for the remaining configurations; the reported shared-memory requirements are 255488, 263680, 276224, 280320, 329728, 337920, and 403968 bytes against the 232448-byte hardware limit ...]
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 403968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 412160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 412160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 412160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 412160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 552448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 552448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 560640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 560640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 560640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 560640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 8.529049634933472s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.0771217346191406s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1195790767669678s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 11.513381719589233s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.836657762527466s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.298607587814331s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 510976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 510976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 659456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 659456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 675840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 675840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 675840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 675840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 807936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 807936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 824320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 824320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 824320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 824320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1104896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1104896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1121280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1121280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1121280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1121280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 30.039280891418457s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 16.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 16.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 7.3039727210998535s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 16.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 16.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.613384246826172s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 16.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 16.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 724992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 757760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 757760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1021952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1054720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1054720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1318912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1351680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1351680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1615872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1648640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1648640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 2209792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 2242560, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 2242560, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 6.6451661586761475s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 16.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 16.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[triton-dejavu] First execution including JIT compilation took 2.0081593990325928s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 16.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[triton-dejavu] First execution including JIT compilation took 8.052458047866821s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.2213785648345947s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.07881498336792s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 8.08164668083191s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.2511954307556152s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1090149879455566s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 8.324692964553833s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.2822751998901367s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 10.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1038963794708252s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 8.203450679779053s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.3637242317199707s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.166776418685913s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 8.533335447311401s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.439464569091797s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1518568992614746s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 8.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 304896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 304896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 313088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 313088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 313088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 313088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 9.856246948242188s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 6.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 6.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.7604939937591553s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 6.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 6.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2217411994934082s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 6.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 6.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 10.088243961334229s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 6.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 6.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.0190610885620117s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 6.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 6.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.248652458190918s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 6.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 6.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 10.579221248626709s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 4.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 4.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.1951639652252197s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 4.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 4.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2641730308532715s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 4.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 4.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-bench_cudagraph failed with out of resource: shared memory, Required: 280064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 280064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-(the identical OutOfResources traceback repeats for the remaining oversized configurations in this sweep, with Required: 296448, 362496, 378880, 444928, 461312, 609792, 626176, 263168, 395264, 428032, 560128, 592896, 724992, 757760, 889856, 922624, 1219584 and 1252352 bytes against the same 232448-byte hardware limit)
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__
-    self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
-RuntimeError: Triton Error [CUDA]: out of memory
-(this launch failure occurs twice during the sweep)
-
-[triton-dejavu] First execution including JIT compilation took 4.474817276000977s.
-[triton-dejavu] First execution including JIT compilation took 1.7217485904693604s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 4.94 MiB is free.
-(torch.OutOfMemoryError raised twice more from the same cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") allocation in triton_dejavu/testing.py, line 351, with the same memory statistics as above)
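The two recoverable failure modes above are exactly the ones the messages point at: autotuner configurations whose shared-memory footprint exceeds the reported 232448-byte per-SM limit, and plain CUDA OOM once the benchmark process has filled the 80 GiB card. A minimal sketch of how both hints could be applied is below; it assumes a hypothetical kernel and search space (the kernel, block-size names and config values are illustrative, not the actual triton-dejavu search space):

import os
# Allocator hint quoted verbatim in the OOM messages above; it must be set
# before the first CUDA allocation in the process to take effect.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import triton
import triton.language as tl

# Conservative search space: small blocks and few pipeline stages keep the
# per-config shared-memory footprint well below the ~232 KB limit reported
# by the failing runs.
_CONFIGS = [
    triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_warps=4, num_stages=2),
    triton.Config({"BLOCK_M": 128, "BLOCK_N": 64}, num_warps=8, num_stages=2),
    triton.Config({"BLOCK_M": 64, "BLOCK_N": 128}, num_warps=8, num_stages=3),
]

@triton.autotune(configs=_CONFIGS, key=["M", "N"])
@triton.jit
def scale_kernel(x_ptr, y_ptr, scale, M, N,
                 BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
    # Trivial 2D kernel standing in for the attention kernel being tuned.
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    offs = offs_m[:, None] * N + offs_n[None, :]
    x = tl.load(x_ptr + offs, mask=mask, other=0.0)
    tl.store(y_ptr + offs, x * scale, mask=mask)

Whether expandable_segments actually helps here depends on allocator fragmentation; the shared-memory failures, by contrast, are deterministic for a given configuration and can only be avoided by pruning such configurations from the search space, as the error text itself suggests.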
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  (same call chain as above through triton_dejavu/testing.py, triton_dejavu/autotuner.py and triton/runtime/jit.py)
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-RuntimeError: Triton Error [CUDA]: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 9.443265199661255s.
-[triton-dejavu] First execution including JIT compilation took 3.8229305744171143s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-(torch.OutOfMemoryError raised twice more from the cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") allocation in triton_dejavu/testing.py, line 351)
-
-(further OutOfResources failures follow with Required: 460800, 526336, 790528, 856064, 1120256 and 1185792 bytes against the same 232448-byte hardware limit, each with the identical traceback shown earlier)
-
-bench_cudagraph failed with out of resource: shared memory, Required: 1449984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1449984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1515520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1515520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1515520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1515520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1779712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1779712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1845248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1845248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1845248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1845248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2439168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2439168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2504704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2504704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2504704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__
-    self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
-RuntimeError: Triton Error [CUDA]: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 5.715268850326538s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 5.526895999908447s.
[... the remaining deleted lines repeat the same "Triton Error [CUDA]: out of memory" tracebacks, CUDA out-of-memory reports, and [triton-dejavu] JIT-compilation timing lines verbatim ...]
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 263424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 279808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 279808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 279808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 279808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 362240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 362240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 378624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 378624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 378624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 378624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 329216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 361984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 361984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 361984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 361984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 526848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 526848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 559616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 559616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 559616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 559616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 724480, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 724480, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 757248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 757248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 757248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 757248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-bench_cudagraph failed with out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 328704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 328704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 526336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 526336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 658432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 723968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 723968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 856064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 921600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 921600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1119232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1119232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1448960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1514496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1514496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 526336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 657408, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 657408, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 921600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1052672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1052672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1316864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1447936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1447936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1712128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1843200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1843200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 2107392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 2238464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 2238464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 2897920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 3028992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 3028992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 3028992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] added BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 16, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None for _chunk_scan_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default and key ('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16') -[2025-07-23 21:36:10] Triton autotuning for function _chunk_scan_fwd_kernel finished after 15278.82s; best config selected: BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 16, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None with benchmark time 0.014237518422305584; evaluated 2625 configurations; -[triton-dejavu] ('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16') not in cache, starting to tune... -[triton-dejavu] [2025-07-23 21:36:10] Started benchmarking of 2625 configurations... (use_bo: False, run: 0) -[triton-dejavu] First execution including JIT compilation took 0.19137167930603027s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
(the same torch.OutOfMemoryError traceback, "[triton-dejavu] First execution including JIT compilation took ...s." line, and "bench_cudagraph failed with CUDA out of memory" message repeat for each remaining configuration, differing only in the reported JIT compilation time)
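The OutOfResources failures earlier in this log come from tile configurations whose per-stage tiles exceed the 232448-byte shared-memory limit reported by the GPU; the autotuner still compiles and launches each of them before discovering this. Below is a minimal, hypothetical sketch (the helper name and the footprint estimate are assumptions, not code from triton-dejavu or vllm-triton-backend) of pre-filtering a block-size sweep by an estimated shared-memory footprint so that such configurations are skipped instead of failing at launch:

from itertools import product

# Hardware limit taken from the log above (bytes of shared memory available to one block).
SHARED_MEM_LIMIT = 232448


def fits_in_shared_memory(block_m: int, block_n: int, block_k: int,
                          num_stages: int, bytes_per_elem: int = 2) -> bool:
    """Coarse estimate for a tiled matmul-style kernel: one (M x K) tile and one
    (K x N) tile resident per pipeline stage, bfloat16 elements by default."""
    per_stage = (block_m * block_k + block_k * block_n) * bytes_per_elem
    return per_stage * num_stages <= SHARED_MEM_LIMIT


candidates = list(product((16, 32, 64, 128, 256),     # BLOCK_SIZE_M
                          (16, 32, 64, 128, 256),     # BLOCK_SIZE_N
                          (16, 32, 64, 128),          # BLOCK_SIZE_K
                          (1, 2, 3, 4, 5, 6, 7, 8)))  # num_stages

viable = [c for c in candidates if fits_in_shared_memory(*c)]
print(f"kept {len(viable)} of {len(candidates)} candidate configurations")

The estimate is deliberately coarse and would need to be validated against the kernel's actual shared-memory usage before being used to prune a real search space.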
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.31691837310791016s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -[triton-dejavu] First execution including JIT compilation took 0.2369706630706787s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. 
Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 
591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call 
last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", 
line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 
591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 
591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call 
last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", 
line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent 
call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
- ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
- fn()
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
- return jit_first_time()
- ^^^^^^^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
- ret = self.call_lambda()
- ^^^^^^^^^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
- self.fn.run(
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
- kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
- ^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
- self._init_handles()
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
- self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
- ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-[... the identical "bench_cudagraph failed with Triton Error [CUDA]: out of memory" traceback repeats for many further autotuner configurations ...]
-
-bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
- fn()
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
- return jit_first_time()
- ^^^^^^^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
- ret = self.call_lambda()
- ^^^^^^^^^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
- self.fn.run(
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
- kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
- ^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
- self._init_handles()
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
- raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[... two further identical shared-memory OutOfResources tracebacks follow, then the CUDA out-of-memory traceback repeats for the remaining configurations ...]
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
- fn()
- ...
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
- self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
- ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 265216, Hardware limit: 232448. 
Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 280064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 280064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 280064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 280064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with out of resource: shared memory, Required: 277504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 277504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[... the same two tracebacks repeat after each of the remaining failures; the distinct failure lines, in order of first appearance, are ...]
-bench_cudagraph failed with out of resource: shared memory, Required: 282624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 353280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 418816, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 560128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-bench_cudagraph failed with out of resource: shared memory, Required: 282624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 423936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 565248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 706560, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 989184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 
591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call 
last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", 
line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent 
call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 
591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call 
last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", 
line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent 
call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 
591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call 
last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", 
line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent 
call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
[The same "bench_cudagraph failed with Triton Error [CUDA]: out of memory" traceback repeats many more times in this log.]
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py",
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 242688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 242688, Hardware limit: 232448. 
Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 293888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 293888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 293888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 293888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 326656, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 326656, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 587776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 587776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 587776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 587776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 587776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 587776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
[the identical CUDA out-of-memory traceback repeats for every subsequent configuration tried by the autotuner]
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 261632, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 261632, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 289792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 289792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 364544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 364544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 373760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 373760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 373760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 439296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 523264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 588800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 598016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 747520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1046528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
[... the same "bench_cudagraph failed with Triton Error [CUDA]: out of memory" message and traceback are repeated verbatim for each remaining failing configuration ...]
-
-bench_cudagraph failed with out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 342016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 342016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 342016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 342016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error
[CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 253952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 253952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 305152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 407552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421,
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
[... the identical "bench_cudagraph failed with Triton Error [CUDA]: out of memory" traceback repeats verbatim for each further CUDA-graph benchmark attempt; one representative copy is kept above ...]
-
-bench_cudagraph failed with out of resource: shared memory, Required: 236544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 236544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
[... the remaining benchmark attempts fail alternately with the same CUDA out-of-memory traceback and with analogous OutOfResources tracebacks for shared-memory requirements of 268288, 269312, 270336, 335872, 337920, and 403456 bytes against the same 232448-byte hardware limit ...]
-
-bench_cudagraph failed with out of resource: shared memory, Required: 473088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 473088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 538624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 538624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 538624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 538624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 300032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 300032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 300032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 300032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 351232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 351232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 400384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 400384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 400384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 400384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 401408, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 401408, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 499712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 499712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 499712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 499712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 501760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 501760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 600064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 600064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 600064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 600064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 702464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 702464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 800768, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 800768, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 800768, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 800768, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 290304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 290304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 580608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 580608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 663552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 829440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1161216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1325056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
[... identical "bench_cudagraph failed with Triton Error [CUDA]: out of memory" tracebacks repeat verbatim for many more autotuner configurations ...]
-bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  [same call chain as above through triton_dejavu/testing.py, triton_dejavu/autotuner.py and triton/runtime/jit.py]
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
[... the two failure signatures above alternate for the remaining configurations; the OutOfResources entries additionally report Required: 301056, 302080, 303104, 376832, 378880, 452608, 530432 and 604160 bytes against the same 232448-byte hardware limit ...]
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 293888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 293888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 334848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 334848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 501760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 501760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 501760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 501760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 587776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 587776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 669696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 669696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 669696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 669696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
[... the same "bench_cudagraph failed with Triton Error [CUDA]: out of memory" traceback repeats verbatim many more times in this removed log ...]
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 300032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 300032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 300032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 300032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 351232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 351232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 400384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 400384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 400384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 400384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 401408, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 401408, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 499712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 499712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 501760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 600064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 702464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 800768, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 232960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 232960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 265728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 265728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
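Every OutOfResources entry in this log carries the same hint: the tile sizes and num_stages chosen by the tuner overshoot the 232448-byte shared-memory budget reported as the hardware limit. Below is a minimal sketch of how such configurations could be filtered out before autotuning; it assumes a hypothetical fp16 GEMM-style kernel with BLOCK_M/BLOCK_N/BLOCK_K meta-parameters, and the footprint formula and names are illustrative rather than taken from this repository.

    import triton

    SHARED_MEM_LIMIT = 232448  # bytes, the hardware limit quoted in the log above
    ELEM_SIZE = 2              # assume fp16/bf16 operand tiles for this sketch

    def estimated_smem(block_m, block_n, block_k, num_stages):
        # Rough upper bound: each software-pipeline stage keeps one A tile
        # (BLOCK_M x BLOCK_K) and one B tile (BLOCK_K x BLOCK_N) in shared memory.
        return num_stages * (block_m * block_k + block_k * block_n) * ELEM_SIZE

    def prune_configs(configs):
        # Drop every config whose estimated footprint exceeds the hardware limit,
        # so the tuner never compiles a kernel that raises OutOfResources.
        return [
            c for c in configs
            if estimated_smem(c.kwargs["BLOCK_M"], c.kwargs["BLOCK_N"],
                              c.kwargs["BLOCK_K"], c.num_stages) <= SHARED_MEM_LIMIT
        ]

    candidate_configs = prune_configs([
        triton.Config({"BLOCK_M": m, "BLOCK_N": n, "BLOCK_K": 64},
                      num_stages=s, num_warps=8)
        for m in (64, 128, 256)
        for n in (64, 128, 256)
        for s in (2, 3, 4)
    ])

The surviving candidate_configs could then be handed to triton.autotune, or to whatever config-space mechanism the tuner in use exposes, instead of the unconstrained grid that produced the failures recorded here.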
-
-bench_cudagraph failed with out of resource: shared memory, Required: 265728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 266240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 332800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 398336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 465920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 266240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 266240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 532480, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 532480, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 663552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 663552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 663552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 663552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 665600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 665600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 796672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 796672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 796672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 796672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 931840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 931840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1062912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1062912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1062912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1062912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 347648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 347648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 396800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 396800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 695296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 695296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 793600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 793600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 793600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 793600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory
-
-bench_cudagraph failed with out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[... identical OutOfResources traceback repeated for each larger configuration; only the summary lines differ ...]
-bench_cudagraph failed with out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 593920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 794624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 991232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1189888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1390592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1587200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-[... identical CUDA out-of-memory traceback repeated for the remaining configurations benchmarked under CUDA graphs ...]
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret =
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
-
-bench_cudagraph failed with out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 282112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 282112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 282624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 282624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 352256, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 352256, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 352256, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 352256, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 353280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 353280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 422912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 422912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 422912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 422912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 564224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 564224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 564224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 564224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 280576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 280576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 280576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 280576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 282624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 282624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 421888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 421888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 421888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 421888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 423936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 423936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 563200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 563200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 563200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 563200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 565248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 565248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 704512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 704512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 704512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 704512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 706560, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 706560, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 845824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 845824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 845824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 845824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 989184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 989184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1128448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1128448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1128448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1128448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 261632, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 261632, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 298496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 298496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 298496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 298496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 372736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 372736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 372736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 372736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 373760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 373760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 447488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 447488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 447488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 447488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 523264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 523264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 596992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 596992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 596992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 596992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 446464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 446464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 446464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 446464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 598016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 598016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 745472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 745472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 745472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 745472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 747520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 747520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 894976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 894976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 894976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 894976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1046528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1046528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1193984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1193984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1193984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1193984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 290304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 290304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 580608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 580608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 663552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 663552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 829440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 829440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1161216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1161216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1325056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1325056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1325056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1325056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 347648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 396800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 695296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 695296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 793600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 793600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 793600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 793600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 593920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 593920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 593920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 593920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 794624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 794624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 991232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 991232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 991232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 991232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1189888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1189888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1189888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1189888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1390592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1390592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1587200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1587200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1587200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1587200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 247552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 263936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with out of resource: shared memory, Required: 263936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 330240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 330240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 395776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 395776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 395776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 395776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 462336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 462336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 527872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 527872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 528384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 528384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 659456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 659456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 659456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 659456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 660480, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 660480, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 791552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 791552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 791552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 791552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 924672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 924672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
[identical OutOfResources traceback repeated for every remaining candidate configuration: bench_cudagraph failed with required shared memory of 262144, 264192, 526336, 528384, 790528, 792576, 1054720, 1055744, 1056768, 1318912, 1320960, 1583104, 1849344, and 2111488 bytes against the same 232448-byte hardware limit, each time with the advice to reduce block sizes or `num_stages`]
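Every failure above exhausts the 232448-byte shared-memory limit of this GPU before a single measurement succeeds, so the tuner burns hours launching configurations that can never run. A minimal sketch of one way to avoid that, assuming a pre-filter over the candidate config list; the helper names and the footprint estimate are illustrative and not part of this repository:

import triton

SMEM_LIMIT_BYTES = 232448  # hardware limit reported in the log above (H100)

def estimate_smem_bytes(cfg, dtype_bytes=2):
    # Rough upper bound for a tiled matmul-style kernel: one (M x K) and one
    # (K x N) tile staged in shared memory per pipeline stage, bf16 elements.
    m = cfg.kwargs["BLOCK_SIZE_M"]
    n = cfg.kwargs["BLOCK_SIZE_N"]
    k = cfg.kwargs["BLOCK_SIZE_K"]
    return cfg.num_stages * (m * k + k * n) * dtype_bytes

def prune_oversized_configs(configs):
    # Drop candidates that cannot fit before they are ever launched, so the
    # autotuner does not raise OutOfResources for each of them.
    kept = [c for c in configs if estimate_smem_bytes(c) <= SMEM_LIMIT_BYTES]
    # Fall back to the smallest safe config (the one the tuner selected here).
    return kept or [triton.Config(
        {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 16},
        num_warps=2, num_stages=1)]

Whether triton-dejavu exposes such a hook is not shown by this log; in plain Triton a filter like this could plug into the `prune_configs_by` argument of `triton.autotune`.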
-
-[triton-dejavu] added BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 16, BLOCK_SIZE_K: 16, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None for _chunk_state_varlen_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default and key ('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16')
-[2025-07-24 03:00:55] Triton autotuning for function _chunk_state_varlen_kernel finished after 19485.39s; best config selected: BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 16, BLOCK_SIZE_K: 16, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None with benchmark time nan; evaluated 2625 configurations;
-ERROR 07-24 03:00:55 [dump_input.py:69] Dumping input data for V1 LLM engine (v0.1.dev7919+g84c7525) with config: model='/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf', speculative_config=None, tokenizer='/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=132096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"/home/zrlngl/.cache/vllm/torch_compile_cache/9bcd1b9f98","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"max_capture_size":512,"local_cache_dir":"/home/zrlngl/.cache/vllm/torch_compile_cache/9bcd1b9f98/rank_0_0/backbone"},
-ERROR 07-24 03:00:55 [dump_input.py:76] Dumping scheduler output for model execution:
SchedulerOutput(scheduled_new_reqs=[NewRequestData(req_id=0,prompt_token_ids_len=64,mm_inputs=[],mm_hashes=[],mm_positions=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=True, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None, extra_args=None),block_ids=([1], [2], [3], [4], [5], [6], [7], [8], [9], [10]),num_computed_tokens=0,lora_request=None)], scheduled_cached_reqs=CachedRequestData(req_ids=[], resumed_from_preemption=[], new_token_ids=[], new_block_ids=[], num_computed_tokens=[]), num_scheduled_tokens={0: 64}, total_num_scheduled_tokens=64, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, num_common_prefix_blocks=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], finished_req_ids=[], free_encoder_input_ids=[], structured_output_request_ids={}, grammar_bitmask=null, kv_connector_metadata=null) -ERROR 07-24 03:00:55 [dump_input.py:79] Dumping scheduler stats: SchedulerStats(num_running_reqs=1, num_waiting_reqs=0, kv_cache_usage=0.009856630824372714, prefix_cache_stats=PrefixCacheStats(reset=False, requests=0, queries=0, hits=0), spec_decoding_stats=None, num_corrupted_reqs=0) -ERROR 07-24 03:00:55 [core.py:615] EngineCore encountered a fatal error. -ERROR 07-24 03:00:55 [core.py:615] Traceback (most recent call last): -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/engine/core.py", line 606, in run_engine_core -ERROR 07-24 03:00:55 [core.py:615] engine_core.run_busy_loop() -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/engine/core.py", line 633, in run_busy_loop -ERROR 07-24 03:00:55 [core.py:615] self._process_engine_step() -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/engine/core.py", line 658, in _process_engine_step -ERROR 07-24 03:00:55 [core.py:615] outputs, model_executed = self.step_fn() -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/engine/core.py", line 262, in step -ERROR 07-24 03:00:55 [core.py:615] model_output = self.execute_model(scheduler_output) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/engine/core.py", line 248, in execute_model -ERROR 07-24 03:00:55 [core.py:615] raise err -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/engine/core.py", line 239, in execute_model -ERROR 07-24 03:00:55 [core.py:615] return self.model_executor.execute_model(scheduler_output) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/executor/abstract.py", line 87, in execute_model -ERROR 07-24 03:00:55 [core.py:615] output = self.collective_rpc("execute_model", -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/executor/uniproc_executor.py", line 58, in collective_rpc -ERROR 07-24 03:00:55 [core.py:615] answer = 
run_method(self.driver_worker, method, args, kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/utils/__init__.py", line 2990, in run_method -ERROR 07-24 03:00:55 [core.py:615] return func(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context -ERROR 07-24 03:00:55 [core.py:615] return func(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/worker/gpu_worker.py", line 327, in execute_model -ERROR 07-24 03:00:55 [core.py:615] output = self.model_runner.execute_model(scheduler_output, -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context -ERROR 07-24 03:00:55 [core.py:615] return func(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/worker/gpu_model_runner.py", line 1404, in execute_model -ERROR 07-24 03:00:55 [core.py:615] model_output = self.model( -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl -ERROR 07-24 03:00:55 [core.py:615] return self._call_impl(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl -ERROR 07-24 03:00:55 [core.py:615] return forward_call(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/model_executor/models/granitemoehybrid.py", line 634, in forward -ERROR 07-24 03:00:55 [core.py:615] hidden_states = self.model(input_ids, positions, mamba_cache_params, -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/compilation/decorators.py", line 246, in __call__ -ERROR 07-24 03:00:55 [core.py:615] model_output = self.forward(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/model_executor/models/granitemoehybrid.py", line 358, in forward -ERROR 07-24 03:00:55 [core.py:615] def forward( -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl -ERROR 07-24 03:00:55 [core.py:615] return self._call_impl(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl -ERROR 07-24 03:00:55 [core.py:615] return forward_call(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 838, in _fn -ERROR 07-24 03:00:55 [core.py:615] return fn(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/fx/graph_module.py", line 830, in call_wrapped -ERROR 07-24 03:00:55 [core.py:615] return self._wrapped_call(self, *args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/fx/graph_module.py", line 406, in __call__ -ERROR 07-24 03:00:55 [core.py:615] raise e -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/fx/graph_module.py", line 393, in __call__ -ERROR 07-24 03:00:55 [core.py:615] return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc] -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl -ERROR 07-24 03:00:55 [core.py:615] return self._call_impl(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl -ERROR 07-24 03:00:55 [core.py:615] return forward_call(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File ".82", line 220, in forward -ERROR 07-24 03:00:55 [core.py:615] submod_1 = self.submod_1(getitem, s0, getitem_1); getitem = submod_1 = None -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/fx/graph_module.py", line 830, in call_wrapped -ERROR 07-24 03:00:55 [core.py:615] return self._wrapped_call(self, *args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/fx/graph_module.py", line 406, in __call__ -ERROR 07-24 03:00:55 [core.py:615] raise e -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/fx/graph_module.py", line 393, in __call__ -ERROR 07-24 03:00:55 [core.py:615] return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc] -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl -ERROR 07-24 
03:00:55 [core.py:615]     return self._call_impl(*args, **kwargs)
-ERROR 07-24 03:00:55 [core.py:615]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-ERROR 07-24 03:00:55 [core.py:615]   File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
-ERROR 07-24 03:00:55 [core.py:615]     return forward_call(*args, **kwargs)
-ERROR 07-24 03:00:55 [core.py:615]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-ERROR 07-24 03:00:55 [core.py:615]   File ".2", line 5, in forward
-ERROR 07-24 03:00:55 [core.py:615]     mamba_mixer2 = torch.ops.vllm.mamba_mixer2(x_3, output, 'model.layers.0.mixer', None);  x_3 = output = mamba_mixer2 = None
-ERROR 07-24 03:00:55 [core.py:615]                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-ERROR 07-24 03:00:55 [core.py:615]   File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/_ops.py", line 1158, in __call__
-ERROR 07-24 03:00:55 [core.py:615]     return self._op(*args, **(kwargs or {}))
-ERROR 07-24 03:00:55 [core.py:615]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-ERROR 07-24 03:00:55 [core.py:615]   File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/model_executor/layers/mamba/mamba_mixer2.py", line 749, in mamba_mixer2
-ERROR 07-24 03:00:55 [core.py:615]     self.forward_cuda(hidden_states=hidden_states,
-ERROR 07-24 03:00:55 [core.py:615]   File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/model_executor/layers/mamba/mamba_mixer2.py", line 718, in forward_cuda
-ERROR 07-24 03:00:55 [core.py:615]     hidden_states = torch.vstack(ssd_output_list)
-ERROR 07-24 03:00:55 [core.py:615]                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-ERROR 07-24 03:00:55 [core.py:615] torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.66 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.95 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[(1, 64, 1), (1, 128, 1), (1, 512, 1), (1, 1024, 1), (1, 2048, 1), (1, 4096, 1)]
-====== Measuring batch_size 1, input length 64, output length 1 =====
-VLLM_USE_V1=1 python vllm-triton-backend/vllm/benchmarks/benchmark_latency.py --model /net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf --input-len 64 --output-len 1 --batch-size 1 --output-json /home/zrlngl/watsonx/zrl-triton-results-and-notebooks/vllm_benchmarks_latency/-net-storage149-autofs-css22-nmg-models-cos-1bfc857-fmaas-integration-tests-models-granite-4_0-small-base-pipecleaner-hf/NVIDIA_H100_80GB_HBM3/tuning_ignore/exp_2025-07-23_1140//result_bs_1_il_64_ol_1.json --num-iters-warmup 3 --num-iters 3 --tensor-parallel 1
-benchmark command returned 256, stopping...
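For reference, the sweep that produced the lines above launches benchmark_latency.py once per (batch_size, input_len, output_len) point and aborts on the first non-zero exit code. A rough, assumed reconstruction of such a driver follows; the real script is not part of this patch, the model path is a placeholder, and the PYTORCH_CUDA_ALLOC_CONF setting merely follows the hint printed in the OOM message without any claim that it fixes this particular failure:

import os
import subprocess

MODEL_PATH = "/path/to/granite-4_0-small-base-pipecleaner-hf"  # placeholder
POINTS = [(1, 64, 1), (1, 128, 1), (1, 512, 1), (1, 1024, 1), (1, 2048, 1), (1, 4096, 1)]

env = dict(os.environ,
           VLLM_USE_V1="1",
           PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")  # suggested by the OOM message

for bs, il, ol in POINTS:
    cmd = ["python", "vllm-triton-backend/vllm/benchmarks/benchmark_latency.py",
           "--model", MODEL_PATH,
           "--input-len", str(il), "--output-len", str(ol), "--batch-size", str(bs),
           "--num-iters-warmup", "3", "--num-iters", "3", "--tensor-parallel", "1"]
    rc = subprocess.call(cmd, env=env)
    if rc != 0:
        print(f"benchmark command returned {rc}, stopping...")
        break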
diff --git a/vllm b/vllm index 8ba5e3324..f0c503f66 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit 8ba5e3324c93ea4c2b791676baa93838dbe0ca9e +Subproject commit f0c503f66e2f6aafa966318d488fd92ac662cdf0 From b05e851e7f96be6538c79cf0bab1d4ef80e7ba49 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Wed, 3 Sep 2025 07:56:23 -0400 Subject: [PATCH 60/61] some cleanup Signed-off-by: Burkhard Ringlein --- ...256,device_name=NVIDIA_H100_80GB_HBM3.json | 146 - ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 - ...512,device_name=NVIDIA_H100_80GB_HBM3.json | 146 - ...384,device_name=NVIDIA_H100_80GB_HBM3.json | 146 - ...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 - ...768,device_name=NVIDIA_H100_80GB_HBM3.json | 146 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 110 - .../default/cache.json | 110 - .../default/cache.json | 98 - .../default/cache.json | 26 - .../default/cache.json | 25 - .../default/cache.json | 31 - .../default/cache.json | 26 - .../default/cache.json | 30 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 27 - .../default/cache.json | 28 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 347 - .../default/cache.json | 387 - .../default/cache.json | 347 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 347 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 27 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 36 - .../default/cache.json | 347 - .../default/cache.json | 387 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 347 - .../default/cache.json | 8 - .../default/cache.json | 32 - .../default/cache.json | 8 - .../default/cache.json | 27 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 36 - .../default/cache.json | 36 - .../default/cache.json | 36 - .../default/cache.json | 387 - .../default/cache.json | 36 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 36 - .../default/cache.json | 35 - .../default/cache.json | 35 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 32 - .../default/cache.json | 32 - .../default/cache.json | 32 - .../default/cache.json | 32 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 32 - .../default/cache.json | 8 - .../default/cache.json | 27 - .../default/cache.json | 27 - .../default/cache.json | 27 - .../default/cache.json | 8 - .../default/cache.json | 27 - .../default/cache.json | 8 - .../default/cache.json | 8 - .../default/cache.json | 8 - triton-dejavu | 2 +- tune_log_g4small.txt | 14544 -- tuning_0.log | 113327 --------------- vllm | 2 +- 85 files changed, 2 insertions(+), 133092 deletions(-) delete mode 100644 E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json delete mode 100644 E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json delete mode 100644 E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json delete mode 100644 E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json delete mode 100644 E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json delete mode 100644 
E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json delete mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json delete mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json delete mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/cuda_12.4/gpu_NVIDIA_A100-SXM4-80GB/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/rocm_6.3.1/gpu_AMD_Instinct_MI250X_MI250/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json delete mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_cumsum_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-e1d63b4ce9f3ae5e2f38b68d3d8257474338c0a672ac24128b374d342459d7e1/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json delete mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-68916ac9231d70c9dfa4b1081268470f5b25a8dbabb73d3818ba7e74c7fdc03c/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-c01d6c3dfb6d587c5fb5a1edbe6d606a9804204c3305d997bb82640bf3e80282/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_6.3.1/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json delete mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-90178d0ab8e71db9cd16710d562763dd010643f28cd21980d5064c3ab782ecaa/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 
100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-48e3cb6cd6592d4b55826bce9ff39781f5f8d3beec28e171da3dd4e5109ad732/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json delete mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json delete mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-96fc3b4e585fc8cfcb4fcdd974640839b5a5889cf4f54dbf57ad6a3439b671d0/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f52792779faa0af779cada63f2df14c185a5b34f253646e36c07bb8926f93dc8/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-88d41f86261407aa0eaf355d2d650ddaee68bdf62e28c6cc74f4e1bcacddcfd8/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json delete mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3a6fc1c46225b2f7d0bc848adf5344e3dda28dcbb0957584ee22138ce6625218/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3ec72a24614e22e4f8984d4b3b95b35928fcaf36a5101e03f51287f47aa54959/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json delete mode 100755 
ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-52c92ceef6d420c78c5c5940c8b38fe551467bdabe0ca1810415fbe039359610/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-5c087adac96d09b2060f573486a99205cda08f58e544b9acfd14918832e2e582/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-72dc6d55a572ac899f3da4b41257cc6aeb8cad69a0fc94b16aa73ca9c82b4012/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json delete mode 100755 ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-f130aa2e7a5258b0e95f6494e2db37f5dea3ccbb97ee8feed09d2d36599bff88/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-071e784de56797ed9764ebe722a0ebf6c8c9719610c15e34a8b3a8f9fe7252ae/default/cache.json delete mode 100644 tune_log_g4small.txt delete mode 100644 tuning_0.log diff --git a/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json b/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json deleted file mode 100644 index 147a83660..000000000 --- a/E=62,N=256,device_name=NVIDIA_H100_80GB_HBM3.json +++ /dev/null @@ -1,146 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "2": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "4": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - 
"GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 4 - }, - "8": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 3 - }, - "16": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2 - }, - "24": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 2 - }, - "32": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2 - }, - "48": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 2 - }, - "64": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2 - }, - "96": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2 - }, - "128": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 2 - }, - "256": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 8, - "num_stages": 3 - }, - "512": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2 - }, - "1024": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2 - }, - "1536": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "2048": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "3072": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "4096": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - } -} diff --git a/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json deleted file mode 100644 index ac556d936..000000000 --- a/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +++ /dev/null @@ -1,146 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "2": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "4": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 5 - }, - "8": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2 - }, - "16": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2 - }, - "24": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 3 - }, - "32": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 4 - }, - "48": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4, - 
"num_stages": 3 - }, - "64": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 3 - }, - "96": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 3 - }, - "128": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 3 - }, - "256": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "512": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 3 - }, - "1024": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "1536": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 3 - }, - "2048": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 8, - "num_stages": 4 - }, - "3072": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 4 - }, - "4096": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 8, - "num_stages": 4 - } -} diff --git a/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json b/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json deleted file mode 100644 index a01e9c317..000000000 --- a/E=62,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +++ /dev/null @@ -1,146 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "2": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "4": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 2 - }, - "8": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 2 - }, - "16": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 5 - }, - "24": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 3 - }, - "32": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "48": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 2 - }, - "64": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2 - }, - "96": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2 - }, - "128": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 2 - }, - "256": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2 - }, - "512": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 8, - "num_stages": 2 - }, - "1024": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 
256, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2 - }, - "1536": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "2048": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "3072": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "4096": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - } -} diff --git a/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json b/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json deleted file mode 100644 index a7cfd175d..000000000 --- a/E=72,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +++ /dev/null @@ -1,146 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "2": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "4": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 2 - }, - "8": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 3 - }, - "16": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 4 - }, - "24": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "32": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 4 - }, - "48": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 4 - }, - "64": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 2 - }, - "96": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "128": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "256": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2 - }, - "512": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "1024": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "1536": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "2048": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "3072": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "4096": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - } -} diff --git a/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json 
b/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json deleted file mode 100644 index 79fe4dbe7..000000000 --- a/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +++ /dev/null @@ -1,146 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 5 - }, - "2": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 3 - }, - "4": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 3 - }, - "8": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2 - }, - "16": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 4 - }, - "24": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 4, - "num_stages": 3 - }, - "32": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 3 - }, - "48": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 3 - }, - "64": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 3 - }, - "96": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "128": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 3 - }, - "256": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 3 - }, - "512": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 3 - }, - "1024": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 8, - "num_stages": 4 - }, - "1536": { - "BLOCK_SIZE_M": 256, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 8, - "num_stages": 4 - }, - "2048": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 4 - }, - "3072": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 4 - }, - "4096": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 3 - } -} diff --git a/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json b/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json deleted file mode 100644 index 3caae02cb..000000000 --- a/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +++ /dev/null @@ -1,146 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4 - }, - "2": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 3 - }, - "4": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 64, - "num_warps": 8, - "num_stages": 5 - }, - "8": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4, - 
"num_stages": 4 - }, - "16": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 8, - "num_stages": 3 - }, - "24": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 3 - }, - "32": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 8, - "num_stages": 3 - }, - "48": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 8, - "num_stages": 3 - }, - "64": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 2 - }, - "96": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 4 - }, - "128": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 4 - }, - "256": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 4 - }, - "512": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "1024": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "1536": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "2048": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - }, - "3072": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 16, - "num_warps": 8, - "num_stages": 3 - }, - "4096": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 3 - } -} diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json deleted file mode 100755 index 9808a0231..000000000 --- a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_bmm:_bmm_chunk_fwd_kernel)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - 
"keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json deleted file mode 100755 index d35417f40..000000000 --- a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_scan:_chunk_scan_fwd_kernel)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json deleted file mode 100755 index 0bdded18d..000000000 --- a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_fwd_kernel)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/cuda_12.4/gpu_NVIDIA_A100-SXM4-80GB/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/cuda_12.4/gpu_NVIDIA_A100-SXM4-80GB/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json deleted file mode 100755 index 19e6fc76c..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/cuda_12.4/gpu_NVIDIA_A100-SXM4-80GB/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json +++ /dev/null @@ -1,110 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", - "total_bench_time_s": 211706.17069911957, - "evaluated_configs": 450, - "keys": [ - "HQ", - "HK", - "IS_CAUSAL", - "dropout_p", - "BLOCK_DMODEL", - "stride_qz", - "stride_qh", - "stride_qm", - "stride_qk", - "stride_kz", - "stride_kh", - "stride_kn", - "stride_kk", - "stride_vz", - "stride_vh", - "stride_vn", - "stride_vk", - "stride_oz", - "stride_oh", - "stride_om", - "stride_on", - "stride_bz", - "stride_bh", - "stride_bm", - "stride_bn", - "stride_az", - "stride_ah", - "MAX_SEQLENS_Q", - "MAX_SEQLENS_K", - "VARLEN", - "ACTUAL_BLOCK_DMODEL" - ], - "cache": { - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '32', '32', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '32', '32', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '64', '64', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 
32, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '64', '64', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 32, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '256', '256', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 32, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '256', '256', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 64, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 64, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', 
'1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 64, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '4096', '4096', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '32', '32', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.005401020869612694 - ], - 
"('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '32', '32', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.005471085663884878 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '64', '64', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.0075958045199513435 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '64', '64', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.007605006452649832 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.011812349781394005 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.011950820684432983 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '256', '256', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.019297460094094276 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '256', '256', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.017475301399827003 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.038042228668928146 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.038091544061899185 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 
0.10096532106399536 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.09481953084468842 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.2949035167694092 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.29237720370292664 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '4096', '4096', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.9560787677764893 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json deleted file mode 100755 index a7b0d4282..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json +++ /dev/null @@ -1,110 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", - "total_bench_time_s": 86841.6919836998, - "evaluated_configs": 240, - "keys": [ - "HQ", - "HK", - "IS_CAUSAL", - "dropout_p", - "BLOCK_DMODEL", - "stride_qz", - "stride_qh", - "stride_qm", - "stride_qk", - "stride_kz", - "stride_kh", - "stride_kn", - "stride_kk", - "stride_vz", - "stride_vh", - "stride_vn", - "stride_vk", - "stride_oz", - "stride_oh", - "stride_om", - 
"stride_on", - "stride_bz", - "stride_bh", - "stride_bm", - "stride_bn", - "stride_az", - "stride_ah", - "MAX_SEQLENS_Q", - "MAX_SEQLENS_K", - "VARLEN", - "ACTUAL_BLOCK_DMODEL" - ], - "cache": { - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '32', '32', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '32', '32', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '64', '64', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '64', '64', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 32, BLOCK_N: 32, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', 
'4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '256', '256', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '256', '256', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, 
GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '4096', '4096', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 128, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '32', '32', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.0036645731888711452 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '32', '32', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.0036076440010219812 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '64', '64', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.00487453443929553 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '64', '64', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.0048555657267570496 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.006982282269746065 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.006992792245000601 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', 
'4096', '1', '0', '0', '0', '0', '0', '0', '256', '256', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.010331092402338982 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '256', '256', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.010227189399302006 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.015056964010000229 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.014920394867658615 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.04663630574941635 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.04339428246021271 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.1311214417219162 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.12436506152153015 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '4096', '4096', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.39030927419662476 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/rocm_6.3.1/gpu_AMD_Instinct_MI250X_MI250/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/rocm_6.3.1/gpu_AMD_Instinct_MI250X_MI250/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json deleted file mode 100755 index a7669881a..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.2.0/rocm_6.3.1/gpu_AMD_Instinct_MI250X_MI250/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", - "total_bench_time_s": 86906.62447404861, - "evaluated_configs": 450, - "keys": [ - "HQ", - "HK", - "IS_CAUSAL", - "dropout_p", - "BLOCK_DMODEL", - "stride_qz", - "stride_qh", - "stride_qm", - "stride_qk", - "stride_kz", - "stride_kh", - "stride_kn", - "stride_kk", - "stride_vz", - "stride_vh", - "stride_vn", - "stride_vk", - "stride_oz", - "stride_oh", - "stride_om", - "stride_on", - "stride_bz", - "stride_bh", - "stride_bm", - "stride_bn", - "stride_az", - "stride_ah", - "MAX_SEQLENS_Q", - "MAX_SEQLENS_K", - "VARLEN", - "ACTUAL_BLOCK_DMODEL" - ], - "cache": { - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '16', '16', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 16, BLOCK_N: 16, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '16', '16', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 16, BLOCK_N: 16, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 
16, BLOCK_N: 16, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 2, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 16, BLOCK_N: 16, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 2, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 256, BLOCK_N: 64, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 256, BLOCK_N: 64, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 64, BLOCK_N: 64, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 64, BLOCK_N: 64, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 64, BLOCK_N: 64, PRE_LOAD_V: True, GRID_CU_MULTIP: 2, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', 
'1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 64, BLOCK_N: 64, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '4096', '4096', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 256, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '4096', '4096', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": "BLOCK_M: 256, BLOCK_N: 128, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '16', '16', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.004207286983728409 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '16', '16', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.004182395525276661 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.01809287816286087 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '128', '128', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.017839614301919937 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.09088581800460815 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '512', '512', 'True', 
'128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.088987797498703 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.23396557569503784 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '1024', '1024', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.23347480595111847 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.6691922545433044 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '2048', '2048', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 0.6695101261138916 - ], - "('32', '32', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '4096', '1', '0', '128', '1', '4096', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '4096', '4096', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 2.025791645050049 - ], - "('32', '8', 'True', '0.0', '128', '0', '128', '4096', '1', '0', '128', '1024', '1', '0', '128', '1', '1024', '0', '128', '4096', '1', '0', '0', '0', '0', '0', '0', '4096', '4096', 'True', '128', 'torch.float16', 'torch.float16', 'torch.float16', 'torch.float32', 'torch.float16', 'torch.int32', 'torch.int32', 'torch.int32')": [ - 2.01798415184021 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json deleted file mode 100755 index 0225f79be..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_bmm_chunk_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_bmm:_bmm_chunk_fwd_kernel)", - "total_bench_time_s": 10.309182405471802, - "evaluated_configs": 9, - "keys": [ - "chunk_size", - "K", - "IS_CAUSAL" - ], - "cache": { - "('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 128, BLOCK_SIZE_K: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')": [ - 0.04188799858093262 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": false - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_cumsum_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_cumsum_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default/cache.json deleted file mode 100755 index 5b20369a8..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_cumsum_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default/cache.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_cumsum_fwd_kernel)", - "total_bench_time_s": 8.378965139389038, - "evaluated_configs": 7, - "keys": [ - "chunk_size", - "nheads" - ], - "cache": { - "('256', '64', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": "BLOCK_SIZE_H: 32, num_warps: 4, num_ctas: 1, num_stages: 3, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('256', '64', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32')": [ - 0.05206400156021118 - ] - }, - "timings_data": { - 
"labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": false - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-e1d63b4ce9f3ae5e2f38b68d3d8257474338c0a672ac24128b374d342459d7e1/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-e1d63b4ce9f3ae5e2f38b68d3d8257474338c0a672ac24128b374d342459d7e1/default/cache.json deleted file mode 100755 index 14c211cf5..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_scan_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-e1d63b4ce9f3ae5e2f38b68d3d8257474338c0a672ac24128b374d342459d7e1/default/cache.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_scan:_chunk_scan_fwd_kernel)", - "total_bench_time_s": 36.24500060081482, - "evaluated_configs": 11, - "keys": [ - "chunk_size", - "hdim", - "dstate", - "IS_CAUSAL" - ], - "cache": { - "('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 256, BLOCK_SIZE_K: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')": [ - 0.20547200739383698 - ], - "('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32', 'torch.int32')": [ - 0.6873279809951782 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": false - } -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json deleted file mode 100755 index 2aeb42c51..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_fwd_kernel)", - "total_bench_time_s": 10.325033903121948, - "evaluated_configs": 9, - "keys": [ - "hdim", - "dstate", - "chunk_size" - ], - "cache": { - "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 128, BLOCK_SIZE_K: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32')": [ - 0.08188799768686295 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": false - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json deleted file mode 100755 index 3b86e0dae..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_chunk_state_varlen_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-a6c5e7946f5d4b0ba6fa79217784e3780477be6b4708bab85d511e2f96fb9381/default/cache.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_chunk_state:_chunk_state_varlen_kernel)", - "total_bench_time_s": 23.77578854560852, - "evaluated_configs": 9, - "keys": [ - "hdim", - "dstate", - "chunk_size" - ], - "cache": { - "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 128, BLOCK_SIZE_K: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16')": "BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 32, BLOCK_SIZE_K: 32, num_warps: 2, num_ctas: 1, num_stages: 5, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16')": [ - 0.09270399808883667 - ], - "('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16')": [ - 0.01027199998497963 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": false - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json deleted file mode 100755 index c2b3452bf..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ 
- "signature": "JITFunction(ibm_triton_lib.kernels.mamba_ssm:_selective_scan_update_kernel)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json deleted file mode 100755 index c2b3452bf..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.mamba_ssm:_selective_scan_update_kernel)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json deleted file mode 100755 index 60a6d6935..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "signature": 
"JITFunction(ibm_triton_lib.kernels.mamba_ssm:_selective_scan_update_kernel)", - "total_bench_time_s": 58.42541313171387, - "evaluated_configs": 75, - "keys": [ - "dstate", - "BLOCK_SIZE_DSTATE", - "dim", - "nheads_ngroups_ratio" - ], - "cache": { - "('128', '128', '64', '128', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32')": "BLOCK_SIZE_M: 8, num_warps: 2, num_ctas: 1, num_stages: 6, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('128', '128', '64', '128', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32')": [ - 0.003274054965004325 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-68916ac9231d70c9dfa4b1081268470f5b25a8dbabb73d3818ba7e74c7fdc03c/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-68916ac9231d70c9dfa4b1081268470f5b25a8dbabb73d3818ba7e74c7fdc03c/default/cache.json deleted file mode 100755 index 04198714b..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-68916ac9231d70c9dfa4b1081268470f5b25a8dbabb73d3818ba7e74c7fdc03c/default/cache.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_state_passing:_state_passing_fwd_kernel)", - "total_bench_time_s": 9.725267887115479, - "evaluated_configs": 6, - "keys": [ - "dim" - ], - "cache": { - "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32')": "BLOCK_SIZE: 512, num_warps: 4, num_ctas: 1, num_stages: 3, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32')": "BLOCK_SIZE: 512, num_warps: 4, num_ctas: 1, num_stages: 3, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32')": [ - 0.059007998555898666 - ], - "('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 
'torch.bfloat16', 'torch.int32')": [ - 0.08220800012350082 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": false - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json deleted file mode 100755 index 2540ac5c3..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json deleted file mode 100755 index 5b55f921d..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - 
"total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index 04eb1f234..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,347 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 34544.99443292618, - "evaluated_configs": 540, - "keys": [ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3" - ], - "cache": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, 
num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, 
num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, 
num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, 
BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, 
BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '256', '32', '4', 
'16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003466148627921939 - ], - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003575095208361745 - ], - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004993442911654711 - ], - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006109926383942366 - ], - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03988393768668175 - ], - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.09943539649248123 - ], - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3283151388168335 - ], - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.0377004146575928 - ], - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033776038326323032 - ], - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003488453570753336 - ], - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033901487477123737 - ], - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0032401704229414463 - ], - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004394480027258396 - ], - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004883989226073027 - ], - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0045789312571287155 - ], - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006259772460907698 - ], - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.010929320007562637 - ], - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.040549296885728836 - ], - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.02016238309442997 - ], - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.1051921397447586 - ], - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03749670833349228 - ], - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3411431908607483 - ], - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0701025053858757 - ], - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', 
'128', '0', '1', '1')": [ - 1.0497854948043823 - ], - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0034944734070450068 - ], - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0042336732149124146 - ], - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005933090578764677 - ], - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.026846082881093025 - ], - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07565699517726898 - ], - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2685732841491699 - ], - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.8566849827766418 - ], - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003527216147631407 - ], - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004583046771585941 - ], - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0060236589051783085 - ], - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.026979871094226837 - ], - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08126690983772278 - ], - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2932415306568146 - ], - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.8659728765487671 - ], - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00306075531989336 - ], - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0034781373105943203 - ], - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003616524860262871 - ], - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0030675148591399193 - ], - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0038118616212159395 - ], - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003134604310616851 - ], - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0055700079537928104 - ], - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.009849821217358112 - ], - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014783395454287529 - ], - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04928915575146675 - ], - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.15255023539066315 - ], - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.013137963600456715 - ], - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.4398653507232666 - ], - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.4163719415664673 - ], - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033607585355639458 - ], - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0038107747677713633 - ], - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004322108346968889 - ], - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033715730533003807 - ], - "('16', '32', '8', '16', '32', '4', '16', '128', '128', 
'0', '1', '1')": [ - 0.004160675685852766 - ], - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004942106083035469 - ], - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00334966741502285 - ], - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0050212424248456955 - ], - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007804282940924168 - ], - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007798833306878805 - ], - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014028973877429962 - ], - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03204701468348503 - ], - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08394649624824524 - ], - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08103202283382416 - ], - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.23096241056919098 - ], - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006906270515173674 - ], - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.23079754412174225 - ], - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.7025490999221802 - ], - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6989444494247437 - ], - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 2.3537752628326416 - ], - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004250869620591402 - ], - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005911743268370628 - ], - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.011380953714251518 - ], - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.05582933872938156 - ], - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.16943588852882385 - ], - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.4909878969192505 - ], - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.5911381244659424 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted 
file mode 100755 index 1a8388dae..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,387 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 67657.00523352623, - "evaluated_configs": 540, - "keys": [ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3" - ], - "cache": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: 
None", - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 64, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 
0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 64, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, 
num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, 
num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, 
num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '16', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, 
BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 128, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0034347970504313707 - ], - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0035579479299485683 - ], - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00523252971470356 - ], - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006011391524225473 - ], - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.023085465654730797 - ], - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08206301927566528 - ], - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3279804289340973 - ], - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.1915172338485718 - ], - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033755453769117594 - ], - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', 
'1')": [ - 0.003468221053481102 - ], - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00334682478569448 - ], - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0035435776226222515 - ], - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004342962987720966 - ], - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00496680336073041 - ], - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004553888458758593 - ], - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007391158025711775 - ], - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.011154169216752052 - ], - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04036085680127144 - ], - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.019932862371206284 - ], - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08319558948278427 - ], - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03744187951087952 - ], - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3325899839401245 - ], - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.06968305259943008 - ], - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.184262990951538 - ], - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003470577532425523 - ], - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004544882569462061 - ], - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00577146140858531 - ], - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.022477485239505768 - ], - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04180074483156204 - ], - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.16259081661701202 - ], - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6357383131980896 - ], - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0034817454870790243 - ], - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00421161251142621 - ], - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00583713548257947 - ], - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.02271271124482155 - ], - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07548002898693085 - ], - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.17187528312206268 - ], - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6434140801429749 - ], - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033293836750090122 - ], - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003431792138144374 - ], - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003589486936107278 - ], - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003379078349098563 - ], - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', 
'1')": [ - 0.0041108024306595325 - ], - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033878879621624947 - ], - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006029331590980291 - ], - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008353302255272865 - ], - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.013032807968556881 - ], - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04468222334980965 - ], - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.1537272334098816 - ], - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01300885435193777 - ], - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.48241302371025085 - ], - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.7054001092910767 - ], - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033725856337696314 - ], - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0037622733507305384 - ], - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004256599582731724 - ], - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00334113254211843 - ], - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004093301948159933 - ], - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004860257264226675 - ], - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003374352352693677 - ], - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005010899156332016 - ], - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007828187197446823 - ], - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007898394018411636 - ], - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014706183224916458 - ], - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03305657580494881 - ], - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08440500497817993 - ], - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08125007152557373 - ], - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2514193058013916 - ], - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006724500097334385 - ], - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.22513994574546814 - ], - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.8429425954818726 - ], - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6514143943786621 - ], - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 3.03377103805542 - ], - "('1', '8', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033735581673681736 - ], - "('1', '16', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003457766491919756 - ], - "('1', '32', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003451892174780369 - ], - "('1', '64', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 
0.004423843696713448 - ], - "('1', '128', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004672772716730833 - ], - "('1', '256', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006518691312521696 - ], - "('1', '512', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.010816759429872036 - ], - "('1', '1024', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01876869797706604 - ], - "('1', '2048', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03477397561073303 - ], - "('1', '4096', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07260602712631226 - ], - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004245477728545666 - ], - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006100499536842108 - ], - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008639966137707233 - ], - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04726530611515045 - ], - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.14509893953800201 - ], - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.4709869623184204 - ], - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.6025410890579224 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index 04eb1f234..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,347 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 34544.99443292618, - "evaluated_configs": 540, - "keys": [ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3" - ], - "cache": { - "('16', '16', 
'16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - 
"('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, 
reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 
0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, 
num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, 
num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003466148627921939 - ], - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003575095208361745 - ], - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004993442911654711 - ], - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006109926383942366 - ], - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03988393768668175 - ], - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.09943539649248123 - ], - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3283151388168335 - ], - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.0377004146575928 - ], - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033776038326323032 - ], - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003488453570753336 - ], - "('1', '32', '1', 
'32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033901487477123737 - ], - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0032401704229414463 - ], - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004394480027258396 - ], - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004883989226073027 - ], - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0045789312571287155 - ], - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006259772460907698 - ], - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.010929320007562637 - ], - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.040549296885728836 - ], - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.02016238309442997 - ], - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.1051921397447586 - ], - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03749670833349228 - ], - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3411431908607483 - ], - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0701025053858757 - ], - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.0497854948043823 - ], - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0034944734070450068 - ], - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0042336732149124146 - ], - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005933090578764677 - ], - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.026846082881093025 - ], - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07565699517726898 - ], - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2685732841491699 - ], - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.8566849827766418 - ], - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003527216147631407 - ], - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004583046771585941 - ], - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0060236589051783085 - ], - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.026979871094226837 - ], - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08126690983772278 - ], - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2932415306568146 - ], - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.8659728765487671 - ], - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00306075531989336 - ], - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0034781373105943203 - ], - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003616524860262871 - ], - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0030675148591399193 - ], - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0038118616212159395 - ], - "('8', '8', 
'8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003134604310616851 - ], - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0055700079537928104 - ], - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.009849821217358112 - ], - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014783395454287529 - ], - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04928915575146675 - ], - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.15255023539066315 - ], - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.013137963600456715 - ], - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.4398653507232666 - ], - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.4163719415664673 - ], - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033607585355639458 - ], - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0038107747677713633 - ], - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004322108346968889 - ], - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033715730533003807 - ], - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004160675685852766 - ], - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004942106083035469 - ], - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00334966741502285 - ], - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0050212424248456955 - ], - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007804282940924168 - ], - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007798833306878805 - ], - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014028973877429962 - ], - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03204701468348503 - ], - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08394649624824524 - ], - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08103202283382416 - ], - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.23096241056919098 - ], - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006906270515173674 - ], - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.23079754412174225 - ], - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.7025490999221802 - ], - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6989444494247437 - ], - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 2.3537752628326416 - ], - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004250869620591402 - ], - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005911743268370628 - ], - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.011380953714251518 - ], - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.05582933872938156 - ], - "('512', 
'1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.16943588852882385 - ], - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.4909878969192505 - ], - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.5911381244659424 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index a7c2af725..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json deleted file mode 100755 index a4569e066..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
- "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention_tuned:kernel_unified_attention_2d)",
- "total_bench_time_s": 0.0,
- "evaluated_configs": 0,
- "keys": null,
- "cache": {},
- "timings": {}
-}
\ No newline at end of file
diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-c01d6c3dfb6d587c5fb5a1edbe6d606a9804204c3305d997bb82640bf3e80282/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-c01d6c3dfb6d587c5fb5a1edbe6d606a9804204c3305d997bb82640bf3e80282/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json
deleted file mode 100755
index a7c2af725..000000000
--- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-c01d6c3dfb6d587c5fb5a1edbe6d606a9804204c3305d997bb82640bf3e80282/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
- "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)",
- "total_bench_time_s": 0.0,
- "evaluated_configs": 0,
- "keys": null,
- "cache": {},
- "timings": {}
-}
\ No newline at end of file
diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json
deleted file mode 100755
index 04eb1f234..000000000
---
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,347 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 34544.99443292618, - "evaluated_configs": 540, - "keys": [ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3" - ], - "cache": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '32', '32', '4', '16', '128', 
'128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', 
'32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, 
reg_inc_consumer: 0, maxnreg: None", - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, 
reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, 
reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003466148627921939 - ], - "('32', '32', 
'32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003575095208361745 - ], - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004993442911654711 - ], - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006109926383942366 - ], - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03988393768668175 - ], - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.09943539649248123 - ], - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3283151388168335 - ], - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.0377004146575928 - ], - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033776038326323032 - ], - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003488453570753336 - ], - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033901487477123737 - ], - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0032401704229414463 - ], - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004394480027258396 - ], - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004883989226073027 - ], - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0045789312571287155 - ], - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006259772460907698 - ], - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.010929320007562637 - ], - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.040549296885728836 - ], - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.02016238309442997 - ], - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.1051921397447586 - ], - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03749670833349228 - ], - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3411431908607483 - ], - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0701025053858757 - ], - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.0497854948043823 - ], - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0034944734070450068 - ], - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0042336732149124146 - ], - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005933090578764677 - ], - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.026846082881093025 - ], - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07565699517726898 - ], - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2685732841491699 - ], - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.8566849827766418 - ], - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003527216147631407 - ], - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004583046771585941 - ], - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0060236589051783085 - 
], - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.026979871094226837 - ], - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08126690983772278 - ], - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2932415306568146 - ], - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.8659728765487671 - ], - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00306075531989336 - ], - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0034781373105943203 - ], - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003616524860262871 - ], - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0030675148591399193 - ], - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0038118616212159395 - ], - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.003134604310616851 - ], - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0055700079537928104 - ], - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.009849821217358112 - ], - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014783395454287529 - ], - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04928915575146675 - ], - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.15255023539066315 - ], - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.013137963600456715 - ], - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.4398653507232666 - ], - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.4163719415664673 - ], - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033607585355639458 - ], - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0038107747677713633 - ], - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004322108346968889 - ], - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0033715730533003807 - ], - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004160675685852766 - ], - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004942106083035469 - ], - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00334966741502285 - ], - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0050212424248456955 - ], - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007804282940924168 - ], - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007798833306878805 - ], - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014028973877429962 - ], - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03204701468348503 - ], - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08394649624824524 - ], - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08103202283382416 - ], - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.23096241056919098 - ], - "('1', '256', 
'1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006906270515173674 - ], - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.23079754412174225 - ], - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.7025490999221802 - ], - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6989444494247437 - ], - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 2.3537752628326416 - ], - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.004250869620591402 - ], - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005911743268370628 - ], - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.011380953714251518 - ], - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.05582933872938156 - ], - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.16943588852882385 - ], - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.4909878969192505 - ], - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.5911381244659424 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json deleted file mode 100755 index d3eb13852..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json deleted file mode 100755 index e7d868df2..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_6.3.1/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_6.3.1/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json deleted file mode 100755 index 2540ac5c3..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_6.3.1/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json deleted file mode 100755 index c2b3452bf..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.mamba_ssm:_selective_scan_update_kernel)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-90178d0ab8e71db9cd16710d562763dd010643f28cd21980d5064c3ab782ecaa/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-90178d0ab8e71db9cd16710d562763dd010643f28cd21980d5064c3ab782ecaa/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json deleted file mode 100755 index d6bd3e752..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-90178d0ab8e71db9cd16710d562763dd010643f28cd21980d5064c3ab782ecaa/code_version-669be673bf919df57c10083821a49ac5e1e5629db08d0501c1c298603ad4ecb8/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.mamba_ssm:_selective_scan_update_kernel)", - "total_bench_time_s": 113.2074065208435, - 
"evaluated_configs": 75, - "keys": [ - "dstate", - "BLOCK_SIZE_DSTATE", - "dim", - "nheads_ngroups_ratio" - ], - "cache": { - "('128', '128', '64', '128', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32')": "BLOCK_SIZE_M: 16, num_warps: 4, num_ctas: 1, num_stages: 6, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('128', '128', '64', '128', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16', 'torch.int32')": [ - 0.0050251600332558155 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json deleted file mode 100755 index c2b3452bf..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.mamba_ssm:_selective_scan_update_kernel)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json deleted file mode 100755 index 2540ac5c3..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json deleted file mode 100755 index 2540ac5c3..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-1f316f0fbddd51d950280abb53d67b60494f0cf2c02eeb1b551b0356a33a7dc8/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json deleted file mode 100755 index de8c75698..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 3830.64182972908, - "evaluated_configs": 540, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "is_prefill" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": "BLOCK_M: 32, TILE_SIZE: 16, num_warps: 8, num_ctas: 1, num_stages: 6, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": "BLOCK_M: 32, TILE_SIZE: 16, num_warps: 8, num_ctas: 1, num_stages: 6, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": [ - 0.00517149455845356 - ], - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": [ - 0.00435659708455205 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index db665c68f..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,347 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 72002.96068787575, - "evaluated_configs": 540, - "keys": [ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3" - ], - "cache": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '32', '32', 
'4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', 
'32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, 
reg_inc_consumer: 0, maxnreg: None", - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, 
reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, 
reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006003436166793108 - ], - "('32', '32', 
'32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006077692378312349 - ], - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0066948747262358665 - ], - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008714776486158371 - ], - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03953208029270172 - ], - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08529671281576157 - ], - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.26893165707588196 - ], - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.7998318672180176 - ], - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00574119808152318 - ], - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006026116665452719 - ], - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005752653814852238 - ], - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00608863914385438 - ], - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006379257421940565 - ], - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006695704068988562 - ], - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007991316728293896 - ], - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00874169822782278 - ], - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.021478423848748207 - ], - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.038848876953125 - ], - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03919544070959091 - ], - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08279953896999359 - ], - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07393984496593475 - ], - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.26520422101020813 - ], - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.143253892660141 - ], - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.8069456219673157 - ], - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006098074372857809 - ], - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006664188578724861 - ], - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008316880092024803 - ], - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.032703448086977005 - ], - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07349277287721634 - ], - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.17093537747859955 - ], - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6028901934623718 - ], - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006040927022695541 - ], - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006674066185951233 - ], - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008359000086784363 - ], - 
"('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.033145882189273834 - ], - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0726323127746582 - ], - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.16725540161132812 - ], - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6085386872291565 - ], - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00583583302795887 - ], - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00593462772667408 - ], - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006117511540651321 - ], - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0059266164898872375 - ], - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006205248646438122 - ], - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005945528391748667 - ], - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0069659799337387085 - ], - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.010612651705741882 - ], - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01373966969549656 - ], - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04602960869669914 - ], - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.12627318501472473 - ], - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014789633452892303 - ], - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3502292037010193 - ], - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.0954514741897583 - ], - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005718982312828302 - ], - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006129336543381214 - ], - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006283498369157314 - ], - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0057284715585410595 - ], - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0061799646355211735 - ], - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007406504824757576 - ], - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005748743191361427 - ], - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006614300422370434 - ], - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008334673009812832 - ], - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.010265326127409935 - ], - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.015284508466720581 - ], - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03939511626958847 - ], - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07506544888019562 - ], - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08072267472743988 - ], - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.1980127990245819 - ], - "('1', '256', '1', '256', 
'32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.011478512547910213 - ], - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.21105918288230896 - ], - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.5597497224807739 - ], - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.5454477071762085 - ], - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.9615601301193237 - ], - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00629243953153491 - ], - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008062037639319897 - ], - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01422079000622034 - ], - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0551898293197155 - ], - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.14126861095428467 - ], - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3813389539718628 - ], - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.2401379346847534 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index 5e025265d..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-356e536ec49f15d95d2a2610df8277796c9330d647b924736ed5c106312d4227/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,387 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 81407.73767566681, - "evaluated_configs": 540, - "keys": [ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3" - ], - "cache": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, 
num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, 
BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 256, BLOCK_M: 32, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '1024', '32', '4', '16', '128', 
'128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 256, BLOCK_M: 32, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '16', '32', 
'4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '16', 
'32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - 
"('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '16', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - 
"('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0060075013898313046 - ], - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006072512362152338 - ], - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00672190822660923 - ], - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008806715719401836 - ], - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04485657438635826 - ], - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.09946674853563309 - ], - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.35092800855636597 - ], - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.324418544769287 - ], - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0057691833935678005 - ], - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006055567879229784 - ], - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005804183427244425 - ], - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006106226239353418 - ], - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006440665107220411 - ], - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006741056218743324 - ], - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007889878936111927 - ], - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008913432247936726 - ], - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.021346861496567726 - ], - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04106005281209946 - ], - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03879227116703987 - ], - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0952981486916542 - ], - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 
0.0731193870306015 - ], - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3475594222545624 - ], - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.14168496429920197 - ], - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.324677586555481 - ], - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0060554975643754005 - ], - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006669852416962385 - ], - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008174276910722256 - ], - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03536117449402809 - ], - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07847916334867477 - ], - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.18417692184448242 - ], - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6875757575035095 - ], - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006102146580815315 - ], - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006687485612928867 - ], - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0084276357665658 - ], - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03678948059678078 - ], - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07642015814781189 - ], - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.18387676775455475 - ], - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6868319511413574 - ], - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005820533260703087 - ], - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0059619504027068615 - ], - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006105729844421148 - ], - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005979663692414761 - ], - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0062386938370764256 - ], - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005969700403511524 - ], - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007005539257079363 - ], - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.011318272911012173 - ], - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01767335832118988 - ], - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.048929426819086075 - ], - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.1755041629076004 - ], - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01716405153274536 - ], - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.5103733539581299 - ], - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.8636406660079956 - ], - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0057952022179961205 - ], - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ 
- 0.006148397456854582 - ], - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006287233904004097 - ], - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005749743431806564 - ], - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006230741273611784 - ], - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007458249572664499 - ], - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00579081941395998 - ], - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006615426391363144 - ], - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00870793592184782 - ], - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01026986539363861 - ], - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.015668710693717003 - ], - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.040304314345121384 - ], - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0959310457110405 - ], - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0849064514040947 - ], - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2615358829498291 - ], - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.011502742767333984 - ], - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.25011205673217773 - ], - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.8817259073257446 - ], - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.7242566347122192 - ], - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 3.2800190448760986 - ], - "('1', '8', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00581999821588397 - ], - "('1', '16', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0058884210884571075 - ], - "('1', '32', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0058985608629882336 - ], - "('1', '64', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0065222084522247314 - ], - "('1', '128', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008244817145168781 - ], - "('1', '256', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.011564841493964195 - ], - "('1', '512', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.021496908739209175 - ], - "('1', '1024', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.038903381675481796 - ], - "('1', '2048', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07334144413471222 - ], - "('1', '4096', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.1418607085943222 - ], - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006298307329416275 - ], - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008144522085785866 - ], - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014301695860922337 - ], - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.06052287295460701 - ], - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.1740308254957199 - ], - 
"('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.4944685995578766 - ], - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.7257815599441528 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-48e3cb6cd6592d4b55826bce9ff39781f5f8d3beec28e171da3dd4e5109ad732/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-48e3cb6cd6592d4b55826bce9ff39781f5f8d3beec28e171da3dd4e5109ad732/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index a7c2af725..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-48e3cb6cd6592d4b55826bce9ff39781f5f8d3beec28e171da3dd4e5109ad732/code_version-5929ad03b9fa9764bf7161e5d9bf068628b7668ea2c33d6b1c3d10ebc8b7a0a6/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json deleted file mode 100755 index a4569e066..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention_tuned:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index 5b55f921d..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index a7c2af725..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-2e68df1b2ccc61cd52696753033f640191f6d65a4eba454efdb10ac09cee2f95/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index db665c68f..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-67c5278a57a01b9e312f17a648cae5031730e47c496c02f3a23832e14fc93b14/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,347 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 72002.96068787575, - "evaluated_configs": 540, - "keys": [ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3" - ], - "cache": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 
0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, 
reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, 
num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 2, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 
1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, 
num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, 
BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 16, BLOCK_M: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006003436166793108 - ], - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006077692378312349 - ], - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0066948747262358665 - ], - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008714776486158371 - ], - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03953208029270172 - ], - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08529671281576157 - ], - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.26893165707588196 - ], - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.7998318672180176 - ], - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00574119808152318 - ], - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006026116665452719 - ], - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005752653814852238 - ], - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00608863914385438 - ], - "('1', '64', '1', 
'64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006379257421940565 - ], - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006695704068988562 - ], - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007991316728293896 - ], - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00874169822782278 - ], - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.021478423848748207 - ], - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.038848876953125 - ], - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03919544070959091 - ], - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08279953896999359 - ], - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07393984496593475 - ], - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.26520422101020813 - ], - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.143253892660141 - ], - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.8069456219673157 - ], - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006098074372857809 - ], - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006664188578724861 - ], - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008316880092024803 - ], - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.032703448086977005 - ], - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07349277287721634 - ], - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.17093537747859955 - ], - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6028901934623718 - ], - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006040927022695541 - ], - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006674066185951233 - ], - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008359000086784363 - ], - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.033145882189273834 - ], - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0726323127746582 - ], - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.16725540161132812 - ], - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.6085386872291565 - ], - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00583583302795887 - ], - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00593462772667408 - ], - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006117511540651321 - ], - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0059266164898872375 - ], - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006205248646438122 - ], - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005945528391748667 - ], - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0069659799337387085 - ], - "('128', '128', '64', 
'64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.010612651705741882 - ], - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01373966969549656 - ], - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04602960869669914 - ], - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.12627318501472473 - ], - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014789633452892303 - ], - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3502292037010193 - ], - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.0954514741897583 - ], - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005718982312828302 - ], - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006129336543381214 - ], - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006283498369157314 - ], - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0057284715585410595 - ], - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0061799646355211735 - ], - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007406504824757576 - ], - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005748743191361427 - ], - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006614300422370434 - ], - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008334673009812832 - ], - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.010265326127409935 - ], - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.015284508466720581 - ], - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03939511626958847 - ], - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.07506544888019562 - ], - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08072267472743988 - ], - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.1980127990245819 - ], - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.011478512547910213 - ], - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.21105918288230896 - ], - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.5597497224807739 - ], - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.5454477071762085 - ], - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.9615601301193237 - ], - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00629243953153491 - ], - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008062037639319897 - ], - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01422079000622034 - ], - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0551898293197155 - ], - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.14126861095428467 - ], - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.3813389539718628 - ], - "('2048', 
'4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.2401379346847534 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index a7c2af725..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json deleted file mode 100755 index 901033d5b..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 1805.8680896759033, - "evaluated_configs": 540, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": [ - 0.003383171046152711 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index d3eb13852..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json deleted file mode 100755 index 165560713..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 115.25903606414795, - "evaluated_configs": 90, - "keys": [ - "num_query_heads", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '128', '128', '16')": "TILE_SIZE: 128, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '128', '128', '16')": [ - 0.0028324048034846783 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json deleted file mode 100755 index e7d868df2..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.0/rocm_torch_6.2.41134-65d174c3e/gpu_AMD_Instinct_MI300X/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json deleted file mode 100755 index c2b3452bf..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-cc858283bc49d8a3efa953e9dcc3faca03c9ca6c80289f2653f6831edfaeaee3/code_version-27348010fbb4e918aa147bf4ad8422523d048c0b6250f4179e817019fd4ce395/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.mamba_ssm:_selective_scan_update_kernel)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json deleted file mode 100755 index 2540ac5c3..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/attn_fwd/autotune_config-ce006d964f672cc6e6ac68d422a3ec8e7dcaf9bdcbbbacfbecf47d1f1d0e9d25/code_version-0a43fd896fb3d6519678247aeba94610b596378a3138e88995ca3569d6672a96/tune_features-df62f53ce178f143b59631de953c946e43811ff1b34cd71e422dfdf14ac35bb9/kernel_configs-a70f97e8b3e7aaf9f4a4f7e850b935d2d1b3ad8cd6ad1d0843bb426e13694ae9/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_flash_attention:attn_fwd)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json deleted file mode 100755 index 710e7b803..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 2846.828315258026, - "evaluated_configs": 540, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "is_prefill" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": [ - 0.003479903331026435 - ], - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": [ - 0.003208082402125001 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index 17a69de08..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 1721.1768200397491, - "evaluated_configs": 5400, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "is_prefill" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": "BLOCK_M: 16, TILE_SIZE: 32, num_warps: 8, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": "BLOCK_M: 16, TILE_SIZE: 32, num_warps: 8, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": [ - 0.004668071866035461 - ], - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": [ - 0.0035326406359672546 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json deleted file mode 100755 index 870c8b475..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 863.3593587875366, - "evaluated_configs": 2160, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "is_prefill" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": "BLOCK_M: 16, TILE_SIZE: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": "BLOCK_M: 16, TILE_SIZE: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": [ - 0.007799518760293722 - ], - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": [ - 0.006862994749099016 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json deleted file mode 100755 index 472c55180..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-fd64e5be2a264a3cb0d9e5b63e0346154385787c2cc6fdd11b2135f5ec0e2451/default/cache.json +++ /dev/null @@ -1,387 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention_tuned:kernel_unified_attention_2d)", - "total_bench_time_s": 32995.41111779213, - "evaluated_configs": 2160, - "keys": [ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3" - ], - "cache": { - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '32', '32', '4', '16', 
'128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '4', '4', 
'32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('8', '16', 
'4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', 
'256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 3, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, 
reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '8', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '16', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '32', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 6, num_consumer_groups: 4, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '64', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '128', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '256', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '512', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '1024', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, 
reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '2048', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1', '4096', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 128, BLOCK_M: 16, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 64, BLOCK_M: 32, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '32', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006897487211972475 - ], - "('64', '64', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007865289226174355 - ], - "('128', '128', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.012806367129087448 - ], - "('512', '512', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.11409414559602737 - ], - "('1024', '1024', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.36400967836380005 - ], - "('2048', '2048', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.291664481163025 - ], - "('4096', '4096', '4096', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 4.830662727355957 - ], - "('1', '16', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0067154536955058575 - ], - "('16', '16', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007009030785411596 - ], - "('1', '32', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006567405071109533 - ], - "('32', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 
0.006921715103089809 - ], - "('1', '64', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007554848212748766 - ], - "('64', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007870307192206383 - ], - "('1', '128', '1', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.012347826734185219 - ], - "('128', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.018965136259794235 - ], - "('1', '512', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03259870782494545 - ], - "('512', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.11627256125211716 - ], - "('1', '1024', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0549253448843956 - ], - "('1024', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.37127885222435 - ], - "('1', '2048', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.09950052946805954 - ], - "('2048', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.3021571636199951 - ], - "('1', '4096', '1', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.1874120533466339 - ], - "('4096', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 4.851548671722412 - ], - "('2', '2', '2', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006778071168810129 - ], - "('8', '8', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006958519574254751 - ], - "('16', '16', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006996186450123787 - ], - "('4', '4', '4', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006850973702967167 - ], - "('32', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00791214406490326 - ], - "('8', '8', '8', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006878295913338661 - ], - "('64', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.013943970203399658 - ], - "('128', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.02429494820535183 - ], - "('256', '256', '128', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03789611533284187 - ], - "('512', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.15952551364898682 - ], - "('1024', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.5120749473571777 - ], - "('256', '256', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03336550295352936 - ], - "('2048', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 1.803341269493103 - ], - "('4096', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 6.802962303161621 - ], - "('1', '2', '1', '2', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0067731114104390144 - ], - "('8', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007123402785509825 - ], - "('16', '16', '4', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.013310004025697708 - ], - "('1', '4', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006687874905765057 - ], - "('16', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00769382668659091 - ], - "('32', '32', '8', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014694097451865673 - ], - "('1', '8', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006742445286363363 - ], - "('32', '64', '16', 
'32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.00831019226461649 - ], - "('64', '64', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.02136719599366188 - ], - "('64', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0192007627338171 - ], - "('128', '128', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.04041781276464462 - ], - "('256', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.09291289746761322 - ], - "('512', '512', '128', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.28874820470809937 - ], - "('512', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2564668357372284 - ], - "('1024', '1024', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.918175995349884 - ], - "('1', '256', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.02123316191136837 - ], - "('1024', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.7775593996047974 - ], - "('2048', '2048', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 3.24080228805542 - ], - "('2048', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 2.575653076171875 - ], - "('4096', '4096', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 12.103424072265625 - ], - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0063226004131138325 - ], - "('16', '32', '16', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0069314902648329735 - ], - "('32', '64', '32', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007872514426708221 - ], - "('64', '128', '64', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01249010395258665 - ], - "('256', '512', '256', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08108722418546677 - ], - "('512', '1024', '512', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2769642770290375 - ], - "('1024', '2048', '1024', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.986293613910675 - ], - "('2048', '4096', '2048', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 3.6365156173706055 - ], - "('16', '32', '8', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0069512976333498955 - ], - "('32', '64', '16', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007947840727865696 - ], - "('64', '128', '32', '128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.012514323927462101 - ], - "('256', '512', '128', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.08159603923559189 - ], - "('512', '1024', '256', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.2810220718383789 - ], - "('1024', '2048', '512', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.9966282248497009 - ], - "('2048', '4096', '1024', '4096', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 3.6692380905151367 - ], - "('1', '8', '1', '4', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0068373410031199455 - ], - "('1', '16', '1', '8', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.006867218296974897 - ], - "('1', '32', '1', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.0068841795437037945 - ], - "('1', '64', '1', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.007741984911262989 - ], - "('1', '128', '1', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.01235784962773323 - ], - "('1', '256', '1', 
'128', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.02117188833653927 - ], - "('1', '512', '1', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.03320121765136719 - ], - "('1', '1024', '1', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.05449502542614937 - ], - "('1', '2048', '1', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.09907654672861099 - ], - "('1', '4096', '1', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.19813136756420135 - ], - "('16', '32', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.008454970084130764 - ], - "('32', '64', '32', '32', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.014529259875416756 - ], - "('64', '128', '64', '64', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.026538236066699028 - ], - "('256', '512', '256', '256', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.18360291421413422 - ], - "('512', '1024', '512', '512', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.5871036052703857 - ], - "('1024', '2048', '1024', '1024', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 2.0788326263427734 - ], - "('2048', '4096', '2048', '2048', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 7.741743564605713 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index 87360ce3e..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 859.6228244304657, - "evaluated_configs": 5400, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "is_prefill" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": "BLOCK_M: 32, TILE_SIZE: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 9, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, 
maxnreg: None", - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": "BLOCK_M: 32, TILE_SIZE: 64, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 9, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": [ - 0.007184021640568972 - ], - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": [ - 0.006555985659360886 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index 5b55f921d..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index 5b55f921d..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json deleted file mode 100755 index 47793d9a0..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 828.1587612628937, - "evaluated_configs": 540, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "is_prefill" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": "BLOCK_M: 16, TILE_SIZE: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None", - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": "BLOCK_M: 16, TILE_SIZE: 32, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', 'True')": [ - 0.0039040199480950832 - ], - "('32', '4', '16', '128', '128', '0', '1', '1', 'False')": [ - 0.0035902990493923426 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json deleted file mode 100755 index a83cef97e..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-5519d9b1918ec274a537269f5fbd0ad024b0e4043a66d66c7a04f6cac9f334e4/default/cache.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention:kernel_unified_attention_2d)", - "total_bench_time_s": 363.07500290870667, - "evaluated_configs": 540, - "keys": [ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3" - ], - "cache": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 256, BLOCK_M: 512, num_warps: 8, num_ctas: 1, num_stages: 8, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 4.2064047534040583e-07 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-96fc3b4e585fc8cfcb4fcdd974640839b5a5889cf4f54dbf57ad6a3439b671d0/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-96fc3b4e585fc8cfcb4fcdd974640839b5a5889cf4f54dbf57ad6a3439b671d0/default/cache.json deleted file mode 100755 index 6f91d97c5..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-96fc3b4e585fc8cfcb4fcdd974640839b5a5889cf4f54dbf57ad6a3439b671d0/default/cache.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention_tuned:kernel_unified_attention_2d)", - "total_bench_time_s": 364.13932609558105, - "evaluated_configs": 540, - "keys": [ - "MAX_SEQ_Q", - "MAX_SEQ_K", - "AVG_SEQ_Q", - "AVG_SEQ_K", - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3" - ], - "cache": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": "BLOCK_N: 32, BLOCK_M: 16, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 3, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('16', '16', '16', '16', '32', '4', '16', '128', '128', '0', '1', '1')": [ - 0.005123822949826717 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index 5b55f921d..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-02e42aebb852f5434774bed2b71d5068bfc814b8ac9e51b22daea515774dea00/tune_features-962cbe32858d7341dc68665ec8ce800f0f76b8b166e05ee23529e0fa6ab3a327/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f52792779faa0af779cada63f2df14c185a5b34f253646e36c07bb8926f93dc8/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-88d41f86261407aa0eaf355d2d650ddaee68bdf62e28c6cc74f4e1bcacddcfd8/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f52792779faa0af779cada63f2df14c185a5b34f253646e36c07bb8926f93dc8/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-88d41f86261407aa0eaf355d2d650ddaee68bdf62e28c6cc74f4e1bcacddcfd8/default/cache.json deleted file mode 100755 index a4569e066..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_2d/autotune_config-f52792779faa0af779cada63f2df14c185a5b34f253646e36c07bb8926f93dc8/code_version-acc8ddf2fa7ddbee69152b55dbfd76d34237b498240e5018c8d39a810bdfc157/tune_features-1951755092d3da5141f4b15aeee3b864a29766ecdb441f9f148e955fcfae08c6/kernel_configs-88d41f86261407aa0eaf355d2d650ddaee68bdf62e28c6cc74f4e1bcacddcfd8/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_attention_tuned:kernel_unified_attention_2d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json deleted file mode 100755 index 368c26881..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-2fb9da5c61f738671835102ccdd28d50088d75910fde234f351106a0ce7f26c7/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 1181.724599123001, - "evaluated_configs": 540, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - 
"HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": [ - 0.0031476265285164118 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index 6b8ebea6e..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-43db3e456dad9b8f5957c072620e2352182b9932c3afe84acc25ee55de7820e6/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 720.5651552677155, - "evaluated_configs": 5400, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": "BLOCK_M: 32, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": [ - 0.003578872187063098 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json deleted file mode 100755 index 12932629d..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-557cfd5eb85d621d36e3752d23a6edf37fca5081ba21b7c6224075f8030bebe5/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-215ec5d35f38d4a8d3e16b38a76fd4814aeed8b301040d2ed65a74f1e0d1b7c1/default/cache.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 368.8641257286072, - "evaluated_configs": 2160, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 6, num_consumer_groups: 2, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": [ - 0.003861392615363002 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index 02018ed3d..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-5ecd4b3212a090c33f40bb38c9569203545d5cc0c121a354acfa9d8dbfac00e3/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "signature": 
"JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 362.4042990207672, - "evaluated_configs": 5400, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 4, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 9, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": [ - 0.0031293570064008236 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index d3eb13852..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index d3eb13852..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-e113dd0d01ca176368717416663afafb242e95298cb899bcd104df7a0d5b519a/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json deleted file mode 100755 index 81ab50506..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-eff99677f7c0c1715ee99c9f1c8cf2a597630dd934ea82c3a3f4cdcd26d2e859/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-e5aec45b908a71ecbc7e3dfff6ae8a81f704e5b537b5417eda24ffd902cb2ead/default/cache.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 360.36944031715393, - "evaluated_configs": 540, - "keys": [ - "num_query_heads", - "num_queries_per_kv", - "BLOCK_SIZE", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "SLIDING_WINDOW", - "stride_k_cache_3", - "stride_v_cache_3", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": "BLOCK_M: 16, TILE_SIZE: 16, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '4', '16', '128', '128', '0', '1', '1', '16')": [ - 0.0035186302848160267 - ] - }, - 
"timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json deleted file mode 100755 index d3eb13852..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/kernel_unified_attention_3d/autotune_config-f46844f6828085003f903385fcad5ba9796c817642f8ac3bcd306f51e3385240/code_version-ae21affde9839117e60834f537cba394504b3944d86f36c648ec90c44bc1268d/tune_features-4d3d317831d8d2162f639122432a06319f08e41201cc90829d1e209768044696/kernel_configs-18df728547c73c192cc6a71218efbe904108ed848ef261857bd84580da7bb6c9/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:kernel_unified_attention_3d)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3a6fc1c46225b2f7d0bc848adf5344e3dda28dcbb0957584ee22138ce6625218/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3a6fc1c46225b2f7d0bc848adf5344e3dda28dcbb0957584ee22138ce6625218/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json deleted file mode 100755 index d53f63026..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3a6fc1c46225b2f7d0bc848adf5344e3dda28dcbb0957584ee22138ce6625218/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 367.19957637786865, - 
"evaluated_configs": 900, - "keys": [ - "num_query_heads", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '128', '128', '16')": "TILE_SIZE: 32, num_warps: 8, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '128', '128', '16')": [ - 0.0031237052753567696 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3ec72a24614e22e4f8984d4b3b95b35928fcaf36a5101e03f51287f47aa54959/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3ec72a24614e22e4f8984d4b3b95b35928fcaf36a5101e03f51287f47aa54959/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json deleted file mode 100755 index e30476d4b..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-3ec72a24614e22e4f8984d4b3b95b35928fcaf36a5101e03f51287f47aa54959/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 360.16377663612366, - "evaluated_configs": 900, - "keys": [ - "num_query_heads", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '128', '128', '16')": "TILE_SIZE: 32, num_warps: 8, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '128', '128', '16')": [ - 0.0031249839812517166 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-52c92ceef6d420c78c5c5940c8b38fe551467bdabe0ca1810415fbe039359610/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json 
b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-52c92ceef6d420c78c5c5940c8b38fe551467bdabe0ca1810415fbe039359610/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json deleted file mode 100755 index acb692e9e..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-52c92ceef6d420c78c5c5940c8b38fe551467bdabe0ca1810415fbe039359610/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 102.50655031204224, - "evaluated_configs": 90, - "keys": [ - "num_query_heads", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '128', '128', '16')": "TILE_SIZE: 16, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '128', '128', '16')": [ - 0.0022160690277814865 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-5c087adac96d09b2060f573486a99205cda08f58e544b9acfd14918832e2e582/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-5c087adac96d09b2060f573486a99205cda08f58e544b9acfd14918832e2e582/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json deleted file mode 100755 index e7d868df2..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-5c087adac96d09b2060f573486a99205cda08f58e544b9acfd14918832e2e582/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json deleted file mode 100755 index 4230de538..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-6c8b849c4a86df1c035ff18afd7f97dbde21b6a9d2a4cd061e7d427b58926deb/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-4ff2dae47e05b54eacd30c273f7ef180b7005c9c803f2dcdc06c54c6231a6d0a/default/cache.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 77.74497675895691, - "evaluated_configs": 90, - "keys": [ - "num_query_heads", - "HEAD_SIZE", - "HEAD_SIZE_PADDED", - "NUM_SEGMENTS_PER_SEQ" - ], - "cache": { - "('32', '128', '128', '16')": "TILE_SIZE: 32, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None" - }, - "timings": { - "('32', '128', '128', '16')": [ - 0.002219553105533123 - ] - }, - "timings_data": { - "labels": [ - "ms" - ], - "rep_t_ms": 100, - "warmup_t_ms": 25, - "cuda_graphs": true - } -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-72dc6d55a572ac899f3da4b41257cc6aeb8cad69a0fc94b16aa73ca9c82b4012/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-72dc6d55a572ac899f3da4b41257cc6aeb8cad69a0fc94b16aa73ca9c82b4012/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json deleted file mode 100755 index e7d868df2..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-72dc6d55a572ac899f3da4b41257cc6aeb8cad69a0fc94b16aa73ca9c82b4012/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json deleted file mode 100755 index e7d868df2..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-cf2079b9132ed361b2c82edfb1bfdd33dd125e77b296333831cd769205dc5ed5/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-f130aa2e7a5258b0e95f6494e2db37f5dea3ccbb97ee8feed09d2d36599bff88/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-071e784de56797ed9764ebe722a0ebf6c8c9719610c15e34a8b3a8f9fe7252ae/default/cache.json b/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-f130aa2e7a5258b0e95f6494e2db37f5dea3ccbb97ee8feed09d2d36599bff88/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-071e784de56797ed9764ebe722a0ebf6c8c9719610c15e34a8b3a8f9fe7252ae/default/cache.json deleted file mode 100755 index e7d868df2..000000000 --- 
a/ibm-triton-lib/ibm_triton_lib/kernels/dejavu_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/reduce_segments/autotune_config-f130aa2e7a5258b0e95f6494e2db37f5dea3ccbb97ee8feed09d2d36599bff88/code_version-b37262dacaff6036668bc34e1f1c608a3ef676c234fe6d6fbc17ee96f402b34b/tune_features-edcf573eb72713c62e0d4888b7888beddadaeb4ef856ad3aafaef12e7189c730/kernel_configs-071e784de56797ed9764ebe722a0ebf6c8c9719610c15e34a8b3a8f9fe7252ae/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(ibm_triton_lib.kernels.triton_unified_grid:reduce_segments)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/triton-dejavu b/triton-dejavu index 5c7d4fa99..8f06d4903 160000 --- a/triton-dejavu +++ b/triton-dejavu @@ -1 +1 @@ -Subproject commit 5c7d4fa9915134d1ce12c4e244015ee705cd5df3 +Subproject commit 8f06d4903056e30867620576b251489c3e9baa8c diff --git a/tune_log_g4small.txt b/tune_log_g4small.txt deleted file mode 100644 index e87bf0ece..000000000 --- a/tune_log_g4small.txt +++ /dev/null @@ -1,14544 +0,0 @@ -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.21952056884766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.08415967226028, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 95.1449602842331, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.96367955207825, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.01135909557343, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.8116797208786, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.98880136013031, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.25296032428741, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 120.80431878566742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 
32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.81823897361755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.77791965007782, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.6235209107399, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.30752062797545, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.14223992824554, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.07647919654846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.75008189678192, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 120.48751831054688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.90320003032684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.32399988174438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.56895941495895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.95551943778992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
104.86191987991333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.41519868373871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.11391997337341, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 120.62847971916199, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.37600016593933, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.97728019952774, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.66848009824753, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.06767928600313, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.1966392993927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.42016017436981, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.24863958358765, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.62255942821503, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.56479978561401, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.26896071434021, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.28272032737732, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.04415988922119, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.0792008638382, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.50959920883179, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.26288032531738, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.2688010931015, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.45119988918304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.7017593383789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.7870409488678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.5108813047409, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.76943969726562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.97296071052551, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.18736004829407, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.30688011646271, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.32127964496613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.01535964012146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.80000078678131, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.95936036109924, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.87296104431152, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.53248119354248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.3331196308136, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.66159987449646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.10959911346436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.59728074073792, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.66848075389862, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.40544068813324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.71199989318848, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.05631959438324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.78655993938446, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.29023921489716, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.43103921413422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 132.36160099506378, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 135.17599940299988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.5961594581604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.2511990070343, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.91151797771454, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.92255985736847, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
114.28623974323273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.46527969837189, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 132.73632049560547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 135.31391978263855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.36176180839539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.91471946239471, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.6344004869461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.98496055603027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.47920024394989, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.38431930541992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 132.95120060443878, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 135.2299201488495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.37504053115845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.74495947360992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.05023980140686, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 140.11984050273895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.35696125030518, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.0168000459671, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 132.5961595773697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 135.19232034683228, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.04480040073395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.37583923339844, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.5473598241806, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.8715192079544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.84543907642365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.47103989124298, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.00479996204376, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.70496046543121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.76848137378693, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.4023984670639, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.50992047786713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.8615991473198, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.36511981487274, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.80255913734436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.90768045186996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.87664031982422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.71503937244415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.07711946964264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.30224001407623, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.21104001998901, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.1295998096466, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.3929613828659, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.92687910795212, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.64911925792694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.03391945362091, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.43920040130615, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.37056005001068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.80703997612, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.48832023143768, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.26639986038208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 100.1515206694603, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
101.96400046348572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.19855999946594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.8088002204895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 99.56783890724182, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.04335874319077, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.51872050762177, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.60496008396149, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.72272074222565, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.21567976474762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.23936033248901, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.17279994487762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.64064049720764, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.37472021579742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.7777590751648, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.35952007770538, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.8687995672226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.75968039035797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.43968081474304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.30911993980408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.02687954902649, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.53423988819122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.84975969791412, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.03359889984131, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.55631983280182, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.79584074020386, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.93856072425842, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.30256044864655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.96799790859222, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.97087967395782, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.3231999874115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.31471943855286, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.96576023101807, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.367520570755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.65295994281769, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.47903847694397, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.14735960960388, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.33376026153564, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.4876799583435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.05407989025116, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.07632029056549, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.60560071468353, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.70032131671906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.22111988067627, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.88960099220276, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.00464046001434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.16095888614655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.5769602060318, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.2671993970871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.15855932235718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.54255974292755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.0723204612732, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
112.99807965755463, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.6131204366684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.03247916698456, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.36495912075043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.80544030666351, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.19104027748108, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 131.7660790681839, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.06976091861725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.76255965232849, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 102.42143988609314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.3687995672226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.9324803352356, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.37872052192688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.53839981555939, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 132.99808025360107, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.91008031368256, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.22000086307526, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.40383970737457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.48848032951355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.6155207157135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.22688007354736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.99743902683258, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 133.76015901565552, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.34768152236938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.54607999324799, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.20000076293945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 
5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.48479974269867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.52512013912201, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.23184096813202, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.87504029273987, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 133.6572802066803, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.28063881397247, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.78575921058655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.30751979351044, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.29695904254913, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.46479988098145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.60704064369202, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.41152131557465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.95151948928833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 
128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.23456013202667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.39776062965393, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.10367977619171, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.68639945983887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.25311958789825, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.8051209449768, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.3476802110672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.16704058647156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.41008043289185, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.75903964042664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.16175937652588, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.17488014698029, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 109.29679989814758, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.19839978218079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.41855978965759, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.36767852306366, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.24863910675049, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.33199977874756, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.05583798885345, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.09647822380066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.77983915805817, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.7124798297882, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.70815801620483, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.71151959896088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.72240233421326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.77952075004578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.56304037570953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.02832114696503, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 120.07695853710175, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.40479922294617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.9411200284958, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.3185601234436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.49311912059784, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.10111904144287, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 120.27872025966644, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.73104083538055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.63423943519592, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.74831986427307, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.85120010375977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.6529585123062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.87776100635529, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.250559091568, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.06048035621643, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.62160098552704, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.8033584356308, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.68688011169434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 120.04672050476074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.29199922084808, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.9167994260788, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.55887842178345, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.2655987739563, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.06879901885986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.35344219207764, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.46080029010773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.7460800409317, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.9990395307541, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.17199909687042, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.28848087787628, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.12816047668457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.47775924205781, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.46000039577484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.22399914264679, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.9752002954483, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 127.80512034893034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.72831928730011, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.7443196773529, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.141921043396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.5223995447159, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.50720012187958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.36864054203033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.70255970954895, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 128.24624001979828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.74352025985718, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.95888149738312, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.94751930236816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.72047889232635, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.25104022026062, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.62576007843018, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.32144069671631, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.92720019817352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.7012814283371, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.63279938697815, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.06560027599335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.5304002761841, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.85968053340912, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.41184139251709, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.9236798286438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 128.36591958999634, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.86223900318146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.77183973789215, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.62944054603577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.11712086200714, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.81632018089294, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.0520007610321, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.4864000082016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.69871914386749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.40607988834381, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.2665604352951, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.5385603904724, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.19055914878845, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.74703991413116, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.71455979347229, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 
32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.6947191953659, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.65887999534607, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.05568087100983, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.01648020744324, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.14928007125856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.02896082401276, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.32528102397919, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.6811203956604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.23071992397308, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.77631902694702, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.06111943721771, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.93888032436371, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 126.60496175289154, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.0680000782013, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.84319949150085, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.4415991306305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.13472127914429, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.74336063861847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.73904037475586, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.28559994697571, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.27183973789217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.1975998878479, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.65455877780914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 149.7278380393982, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.15903985500336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.024799823761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.91360116004944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.57583999633789, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.71583950519562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 149.4928002357483, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.28992092609406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.74128079414368, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.632958650589, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.46528112888336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.60336017608643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 150.0075203180313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.48351860046387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.64655888080597, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.55056059360504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 123.67999970912933, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.87264001369476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 150.05552113056183, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.58527982234955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.86480045318604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.49600088596344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.97728097438812, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.11856126785278, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.73776030540466, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.58143985271454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 133.38784039020538, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.42559957504272, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.33104085922241, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.37104201316833, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.50160014629364, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.28943920135498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.50591969490051, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.0841612815857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 134.6785604953766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.56159949302673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.84159994125366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.31999981403351, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.83199942111969, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.42111897468567, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 101.46144092082977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.34768056869507, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 134.23024117946625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.45343911647797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.84159934520721, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.46431994438171, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.75375938415529, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.53247892856598, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.52127921581268, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 103.80144000053406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 134.35039937496185, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.07647931575775, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 101.87664031982422, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.11376023292542, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.85967814922333, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.84575998783112, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.45504021644592, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.59488093852997, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.74128103256226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.36368036270142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.89103972911835, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.68336057662964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.91631984710693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.44480013847351, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.45135998725891, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.40288043022156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, 
"num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.58799934387207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.28895998001099, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.3340802192688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.92304134368896, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.30864036083221, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.1041601896286, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.21472001075745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.94800066947937, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.44304025173187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.37984156608582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.39584136009216, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.21984088420868, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.46896076202393, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.42127895355225, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.31056010723114, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.45711827278137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.57376039028168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.37167930603027, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.409921169281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.98415994644165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 133.2455998659134, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.7712004184723, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 128.44863891601562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.43071913719177, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 133.6139190196991, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 139.42127883434296, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.65392065048218, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.86176061630249, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 133.6020803451538, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.33040022850037, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.65999972820283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.97120010852814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 132.95343935489655, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.95071983337402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 128.15903961658478, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.68064057826996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.98351907730103, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.36175954341888, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.51632022857666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.44208121299744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 144.69775915145874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.07408046722412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.52512013912201, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 104.69024002552032, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.5959997177124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.2868800163269, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.76304030418396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.39680075645447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 147.35983788967133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.19343960285187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.71935939788818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 105.7367992401123, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.60032165050507, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.35871911048889, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.34912085533142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.36272060871124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.61008059978485, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.5107192993164, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.61135995388031, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.15823984146118, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.62480127811432, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.68415987491608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.47103977203369, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.61184000968933, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 147.23424017429352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.47151911258698, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.7670407295227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.01759910583496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.87151968479156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 117.00736105442047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.4523184299469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.98016059398651, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.85216009616852, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.68816030025482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.799840092659, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.98751997947693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.63568019866943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.80623948574066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.47712087631226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.29728031158447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.5996813774109, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.76976084709167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.77055990695953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.9096006155014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.09008026123047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.53312063217163, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.26447820663452, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.49983859062195, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.35967993736267, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.06191968917847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.03199982643127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 145.27120053768158, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.6116794347763, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.5921607017517, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.65600085258484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.43024051189423, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.62656140327454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.24607956409453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.0454398393631, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.67343997955322, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.01455950737, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.62752044200897, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.32335925102234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 128.9548796415329, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.6206395626068, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.89376056194305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.85776019096375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.4478394985199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.97279894351959, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 128.38175892829895, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.45024061203003, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.90512001514435, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.17328011989594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.41440045833588, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 127.26928174495697, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 128.43840062618256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 225.7868790626526, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.65120136737823, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 225.12335777282715, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.25151884555817, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 226.0313606262207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.28895950317383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 226.62800073623657, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.0836799144745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 129.52304005622864, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.67568051815033, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.33920097351074, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.82191979885101, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.00879991054535, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.1534389257431, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.25584101676941, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.10847973823547, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 129.828320145607, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.13487887382507, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.21135914325714, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.77072060108185, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.1289598941803, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.72975957393646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.0329601764679, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.58287966251372, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, 
"num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 129.964799284935, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.12879979610443, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.29088127613068, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.74480032920837, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.1448016166687, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.63503885269165, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.68143939971924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.30496048927306, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 129.66271877288818, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.46016025543213, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 106.95600152015686, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 119.1267204284668, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.16656005382538, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.56544029712677, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.83792006969452, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 127.81999886035919, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.80527949333191, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 129.81472074985504, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.24527931213377, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.70640063285828, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.6512006521225, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.0147204399109, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.72767961025238, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.4935985803604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.47487878799437, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
139.1350394487381, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.74000024795532, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.30271935462952, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.80384063720703, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 130.26336252689362, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.5336000919342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.71200048923492, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.774240732193, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.1995198726654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 122.43903994560242, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 129.79679882526398, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.79695963859558, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.90367949008942, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.72384083271027, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.81775867938995, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.1662393808365, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.96143901348114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.18992066383362, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.4731193780899, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.2235198020935, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.8363196849823, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.0247997045517, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.85967993736267, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.47679746150969, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.01503896713257, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.11951959133148, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} 
-{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.44816052913666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 135.84495902061462, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.42847967147827, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.0433599948883, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.43791949748993, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.99712014198303, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.29888021945953, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.53103935718536, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.45040047168732, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 135.70480108261108, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.9902400970459, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.6401596069336, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.67295944690704, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.02431833744049, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.87152111530304, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.74975979328156, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.56224083900452, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.25664114952087, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.51568126678467, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.98176038265228, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.36960124969482, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.4332801103592, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.68464028835297, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.0854400396347, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.6854418516159, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 136.23440027236938, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.72208142280579, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.92464065551758, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 110.55232048034668, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 126.52096033096313, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.72879981994629, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.4087997674942, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.4590392112732, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.65296113491058, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.88736009597778, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.3200011253357, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.17872095108032, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.52735924720764, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 113.9382404088974, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.43007957935333, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.02559900283813, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.87903976440428, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 114.18287992477417, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.04304099082947, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 115.41071951389313, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.43984043598175, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.95152008533478, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.26992058753967, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 140.4124802350998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.09360003471375, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.29375958442688, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.38719940185547, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.85760045051575, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 109.08112049102783, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 140.84991931915283, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.27647960186005, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.60351896286011, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.8652799129486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.21391940116882, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.69903922080994, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 140.53727984428406, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 111.84816062450409, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.4835205078125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 137.46832013130188, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.39151978492737, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 108.90143871307373, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 140.96384048461914, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 112.27280080318451, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 107.50783979892731, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.46175932884216, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 142.08927989006042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.96495938301086, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 142.3358392715454, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 139.09039855003357, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 142.34272003173828, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.99744033813477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 142.0116800069809, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.1139212846756, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.59056103229523, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.37391996383667, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.40623903274536, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.6054402589798, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.82912003993988, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.2590389251709, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.4771190881729, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.97791957855225, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.39360105991364, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 143.63711893558502, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.01119816303253, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 167.52016067504883, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.27679932117462, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 143.75552117824554, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 147.8719997406006, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.73103952407837, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.143679022789, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 143.49343955516815, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.17631912231445, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.16864049434662, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.76463949680328, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 144.22991931438446, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.81103885173798, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.08687925338745, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.0835200548172, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 143.50224137306213, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.12000048160553, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.63855934143066, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.64975929260254, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 143.96927952766418, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.81663870811462, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.7153605222702, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.07488214969635, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 143.67088079452515, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.09632074832916, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.2899204492569, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.77568006515503, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 144.21407878398895, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 146.98543965816498, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.23311924934387, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.79552125930786, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.00015926361084, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.90256083011627, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.06847989559174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.55519950389862, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.89967954158783, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.53759944438934, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.66671872138977, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.73455953598022, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 117.17920005321503, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 147.48495995998383, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.64719927310944, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.82831990718842, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.00640082359314, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.74896001815796, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 117.17712044715881, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 147.51216053962708, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.34527909755707, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.10383975505829, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.12752103805542, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.36672019958496, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.50336027145386, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 146.86367869377136, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.23535978794098, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.32480013370514, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.21215963363647, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 121.35007977485657, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 116.5939199924469, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 147.432159781456, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 118.19151997566223, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 113.4444808959961, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.4072003364563, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 142.2188800573349, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.96032106876373, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.51071846485138, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.96464002132416, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.30400002002716, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.24848008155823, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 141.32464051246643, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.78847980499268, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.49935948848724, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.16751968860626, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.9262398481369, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.87023985385895, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.42592060565948, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.36464059352875, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.4867205619812, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.83535861968994, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.40655982494354, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.41471982002258, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 124.31855976581573, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.9025604724884, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 138.37103962898254, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.44303977489471, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 125.0249594449997, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.8339204788208, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 215.62079906463623, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 957.859525680542, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 215.8238399028778, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 957.7103996276855, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 216.74816131591797, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.9918441772461, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 216.19743824005127, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 310.342880487442, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 220.659362077713, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 311.50832176208496, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 224.53823924064636, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 308.7073600292206, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 220.53872108459473, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 311.4625608921051, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 224.4108808040619, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 309.74640011787415, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 219.55552220344543, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 310.98495960235596, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 223.21711897850037, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 308.4230399131775, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 219.64751839637756, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 311.1513590812683, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 224.38512086868286, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 313.44271898269653, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 201.63088023662567, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 312.7019190788269, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 201.55679941177368, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 312.4072003364563, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 200.57920217514038, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 312.54207849502563, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 200.35167932510376, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1742.3779296875, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.2067174911499, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1744.2910289764404, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.79375982284546, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1742.711524963379, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.9486424922943, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1744.2323207855225, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.87616205215454, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.01952052116394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.62031960487366, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.48032069206238, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.61920046806335, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 292.09808349609375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 280.1147210597992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 284.5358383655548, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.73935866355896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.8942415714264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.5022382736206, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.53183913230896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.71408128738403, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.57855796813965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 281.1942434310913, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 276.89647793769836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.0896019935608, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 241.91872119903564, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.92143750190735, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.91775965690613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.46143865585327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 293.72976183891296, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 293.0740785598755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 281.610723733902, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.45647621154785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.82255816459656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.9921579360962, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.35407876968384, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.0457639694214, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.85599875450134, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 285.91423869132996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.71984219551086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 266.5283179283142, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.64704275131226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.27807807922363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.508159160614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.9947168827057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.11983919143677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.1719994544983, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.00688219070432, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.21440148353577, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.6347198486328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.54880023002625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, 
"num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.0759983062744, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.36015939712524, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.635196685791, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.87407779693604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.4654381275177, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.40352058410645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.22224044799805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.44975972175598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.95392274856567, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.35312247276306, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 266.2020790576935, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.5236814022064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.7295968532562, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.6262412071228, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.78448009490967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.9145622253418, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.6259183883667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.08495998382568, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.9737591743469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.7124786376953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.457279920578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 274.2793595790863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.83663868904114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.13760018348694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 349.6822392940521, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 348.5526382923126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 297.20735907554626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 286.7516803741455, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.6828806400299, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.03728008270264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.69167852401733, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 266.7803204059601, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 340.13248085975647, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.70608043670654, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 306.3313591480255, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 283.1761598587036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.5363199710846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.9785556793213, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 274.56560254096985, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 274.64272141456604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.59135913848877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.4353623390198, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 299.56159949302673, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.5857594013214, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.70063948631287, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.99152159690857, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.5889608860016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.45600056648254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 342.4415969848633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.3417594432831, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.79967880249023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 295.44464111328125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.6841607093811, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.87120366096497, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.46335887908936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.45728015899658, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.69248151779175, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.59776306152344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.20623898506165, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.37439966201782, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.82351922988892, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.10031986236572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.88816046714783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.73376178741455, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.47247910499573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.9795217514038, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.8196816444397, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 232.325279712677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.05903935432434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.71856093406677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.4265582561493, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.63824009895325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.48495948314667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.89199948310852, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.21136140823364, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.273921251297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 237.92447924613953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.62288284301758, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.49440050125122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.66384196281433, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.24335932731628, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.32975792884827, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.1110372543335, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.0479965209961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.40528202056885, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.74479913711548, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.82207894325256, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.98671865463257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.4832007884979, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.5790386199951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.31536173820496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.48175740242004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.7710394859314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.99440026283264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.50207924842834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.41551899909973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.1222424507141, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.58799982070923, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.72640132904053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.18607878684998, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.2105596065521, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.2393605709076, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.97424244880676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.96575951576233, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.4475221633911, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.58624362945557, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.51439809799194, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.0265612602234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.7398383617401, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 258.18512082099915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 232.16031908988953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.16160035133362, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.67487788200376, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.6921582221985, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.73792147636414, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.2716784477234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.1812801361084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.38911938667297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.5995206832886, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 307.0958375930786, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.876318693161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.8156816959381, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.9065592288971, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 319.15056228637695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.56784033775332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.78559970855713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.91088247299194, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 303.17375779151917, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.9083209037781, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.3523201942444, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 269.7886383533478, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 313.21776032447815, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.73791790008542, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 281.6812777519226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.67519807815552, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.09056115150452, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.99008011817932, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.23855805397034, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.23551988601685, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.90400099754333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.79983925819397, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.69999885559082, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.42991876602173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.66223859786987, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.32511854171753, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.9667203426361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.7841601371765, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.87407875061035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.08240222930908, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.35088181495667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.58880019187927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.51728177070618, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.64719653129578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.23487997055054, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.01328110694885, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.54559922218323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.3521602153778, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.5411217212677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.48640036582947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.79248213768005, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.17456150054932, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.45440101623535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.31743693351746, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.96320271492004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.69440007209778, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.26880073547363, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.08368062973022, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.31856060028076, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.72479915618896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.98784041404724, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.70687818527222, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.71871900558472, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.0591995716095, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 258.6526393890381, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.16304254531863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.5201587677002, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.5073606967926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.42799925804138, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 242.10608005523682, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.4976007938385, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.79184079170227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.6611201763153, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.9363214969635, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.62848114967346, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.9859209060669, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.1092803478241, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 258.82399916648865, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.89615988731384, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.1480007171631, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.896320104599, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.4648003578186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} 
-{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 285.84223985671997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.4737572669983, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 284.8292803764343, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.12367725372314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 285.5462396144867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.26096200942993, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.5062370300293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.85535717010498, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.52207827568054, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 258.00464034080505, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.47824025154114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.8764772415161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.53631949424744, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.367680311203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.0223984718323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.33823943138123, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.268159866333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.0889618396759, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.8067181110382, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.81071734428406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.4396777153015, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.3075180053711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.84320092201233, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.187358379364, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.13311767578122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
251.80271863937375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.39583897590637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.08655858039856, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.8209617137909, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.90256071090698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.5795180797577, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.66607880592346, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.0385603904724, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.0435209274292, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.8940799236298, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.3729588985443, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.29280114173892, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.6579215526581, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.80336093902588, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.41727948188782, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.78176021575928, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.95904159545898, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.37631821632385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 294.4534409046173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.32879877090454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.8732810020447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.06496000289917, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.02096128463745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.1769597530365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.95407819747925, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.50239872932434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, 
"num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.561119556427, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.6796782016754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.4238407611847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.3622419834137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.67791867256165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.03183722496033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.99663829803467, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.4945592880249, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.8449604511261, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 287.86896109580994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 276.31184339523315, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.7171187400818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.4345588684082, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.65344047546387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.42144060134885, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.26943850517276, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 294.95248079299927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 280.48160314559937, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.8073606491089, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.88479804992676, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.863840341568, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.07840180397034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.32480001449585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 318.296320438385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 281.1646378040314, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
266.11536145210266, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.92464232444763, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 333.23952078819275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.41232204437256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.2470405101776, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.9382390975952, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 319.0112018585205, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 284.5129609107971, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.53664088249207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.92224311828613, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 338.9891195297241, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.2369611263275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.85183858871463, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 269.7323191165924, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 325.809121131897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 284.0215992927551, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 275.96768021583557, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.17887806892395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 333.4542381763458, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.36319851875305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.27279806137088, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.1724796295166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 318.25583934783936, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.9756796360016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.6875183582306, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 274.04703974723816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 324.6737587451935, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 319.8260819911957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.1152012348175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 419.54336047172546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 307.84255862236023, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.5800006389618, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.23616003990173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 308.19984197616577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 400.4302382469177, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 415.4870367050171, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 315.3063988685608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.09984135627747, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.7343990802765, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 315.063841342926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.21135926246643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.1636850833893, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 321.2718403339386, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.9774408340454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.6552016735077, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 306.02863907814026, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 392.4412775039673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.19839882850647, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 300.17647981643677, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.0225579738617, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.6119978427887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.1150405406952, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.2388801574707, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.29615950584412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.89968276023865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.54175877571106, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.54176235198975, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.50863981246948, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.8007991313934, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.05983996391296, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.47103881835938, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.85103750228882, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.82559943199158, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.84928178787231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 243.43408226966858, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.93296027183533, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.54960083961487, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.7364799976349, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 238.97456169128418, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.04367876052856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.67360281944275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.10319852828977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.76943969726562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.44080185890198, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.47872114181519, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.02239799499512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.66575860977173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.80672144889832, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.90288066864014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.3140833377838, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.33375883102417, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.84720134735107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.96367835998535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.1340811252594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.88079833984375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.28271985054016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 276.7140805721283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.33247923851013, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.88559937477112, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.92959690093994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, 
"num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.3996813297272, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.0416009426117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.35360145568848, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.64287972450256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 283.1444811820984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.221120595932, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.9395182132721, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.56591749191284, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 276.5841603279114, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.94032192230225, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.5580816268921, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.8782386779785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 276.3051176071167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 
64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.3062379360199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.2075209617615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 274.66384172439575, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 279.14639949798584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.3352026939392, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.99887919425964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.5262405872345, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.946560382843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.3987205028534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.03615975379944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 269.5625603199005, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 269.2307209968567, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
299.5020806789398, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.6351993083954, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 334.1220808029175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 311.4311981201172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 306.94656133651733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.89888215065, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 332.91375756263733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 318.68255972862244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 305.2179217338562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.6174385547638, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 324.68144059181213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 307.50240087509155, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 302.46543765068054, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.1768000125885, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 326.7999994754791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 313.32687854766846, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.9155192375183, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.112318277359, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.65424132347107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.95135807991028, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.82335948944092, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.8308789730072, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.21503901481628, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.98160338401794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.93824172019958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.4609603881836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.35536074638367, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.9515199661255, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.82656073570254, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.333758354187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.3955225944519, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.16192078590393, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.4726424217224, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.96480178833008, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.79151916503906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.4827220439911, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.57679867744446, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.49823999404907, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.5929594039917, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.00640082359314, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.96751880645752, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.54479598999023, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.84208130836487, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.13327860832214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.64624071121216, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.0177628993988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.38079929351807, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.2073621749878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.36735606193545, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.5886380672455, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.56352066993713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
256.75376057624817, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.1084794998169, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 266.60223841667175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.6502411365509, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.4468812942505, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.03775882720947, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.4503996372223, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.9881603717804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.61855959892273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.05903887748718, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.181759595871, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.9526391029358, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 258.75744104385376, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.7089557647705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 307.1668839454651, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.5600032806396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 306.48687958717346, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.3556799888611, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 309.7987174987793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.0430335998535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 304.7865641117096, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.86544203758237, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.17824029922485, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.06832218170166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 258.1599998474121, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.82112193107605, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, 
"num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.24799919128418, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.28224182128906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.64639830589294, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 266.28495693206787, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.66912055015564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.73392367362976, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.87312006950378, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.76671957969666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.53999948501587, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.1921606063843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.95040011405945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.00752067565918, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.16256165504458, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.0209617614746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.87872266769406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.6051208972931, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.7390389442444, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.25311923027039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.53503966331482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.9148817062378, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.4718391895294, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.39551854133606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.84287881851196, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.16207909584045, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.4056005477905, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 352.1934413909912, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.5265598297119, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.2599997520447, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.70944094657898, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.88944029808047, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 294.3438386917114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 303.8156771659851, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 258.8860809803009, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.76559948921204, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 331.9489586353302, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.9116792678833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.90032243728638, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.0103991031647, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 305.0382399559021, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 305.0651204586029, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 266.00223898887634, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.1060814857483, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 328.5153615474701, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.91376066207886, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.2166352272034, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.3091206550598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 304.85520124435425, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 308.750718832016, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.677122592926, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 281.07760071754456, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 333.1982409954071, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, 
"num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.9015965461731, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.69200110435486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.76496148109436, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 301.6969621181488, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 305.0539195537567, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.70336055755615, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 278.0448019504547, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 328.02639842033386, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 301.6195213794708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 330.54239869117737, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 326.04080080986023, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.1075246334076, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.969598531723, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.0534417629242, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 312.14287996292114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 333.5860800743103, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 326.30671858787537, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.1630401611328, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.79167914390564, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.1092805862427, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 308.8918387889862, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 339.65872049331665, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 322.49824047088623, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.6551992893219, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.41216015815735, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
357.408322095871, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 310.00768065452576, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 341.06191754341125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 326.85360074043274, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.3158423900604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.6566393375397, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.7044794559479, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.16608119010925, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.8992009162903, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.1785578727722, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.07280254364014, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.02735900878906, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.01648330688477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.1176047325134, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.0009641647339, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.0222396850586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.36479878425598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.6985614299774, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.03632164001465, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 275.51679849624634, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.28448057174685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 276.2667179107666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.8003215789795, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.69983983039856, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.87807965278625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.2582411766052, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.89519906044006, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.10096311569214, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 257.03392028808594, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.64735555648804, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.8367967605591, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.8977611064911, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.3278419971466, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.69487833976746, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 264.6022391319275, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 279.27183985710144, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 255.77632308006287, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.29984068870544, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.9742386341095, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.2351987361908, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.29807662963867, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.14720249176025, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.1451184749603, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 283.33136081695557, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 256.3596796989441, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.55439949035645, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 275.05647897720337, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 317.56208062171936, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 285.6779193878174, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 349.53311800956726, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 283.9254403114319, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 328.9796793460846, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.03280091285706, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.37903928756714, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 285.36831855773926, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 315.91487884521484, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 295.1248002052307, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.2944006919861, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.6284821033478, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 320.6968021392822, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.4179184436798, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.3081605434418, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.4956798553467, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.79231977462769, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 255.16111850738525, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.3575987815857, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.0916805267334, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.85792088508606, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.7716805934906, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.6688003540039, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.910560131073, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.87888169288635, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.35568070411685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.7852802276611, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.56767964363098, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.1105630397797, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.27679896354678, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.36527681350708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.58559846878052, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.32384085655212, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.81376194953918, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.10688066482544, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 254.9110412597656, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 243.42976093292236, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.25695967674255, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 253.9756810665131, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.64143919944763, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 297.15904116630554, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.2817602157593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 295.2950417995453, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, 
"num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.0811195373535, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 295.78351974487305, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 291.86432123184204, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.31808042526245, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 294.034880399704, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.79551696777344, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.22127890586853, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.48528027534485, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.64159870147705, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.0388813018799, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 262.42656111717224, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.20639967918396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 265.4167950153351, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 461.5758419036865, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.62799644470215, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.1078460216522, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.31056451797485, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.94576358795166, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 411.25648260116577, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.98624205589294, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 396.9478392601013, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.14416551589966, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 428.6241555213928, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 381.84320092201233, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.6214380264282, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 445.0388813018799, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 428.0851221084595, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.11967873573303, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.59983706474304, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.97088146209717, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.95503854751587, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.623681306839, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.19296288490295, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.2398376464844, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 423.88240218162537, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 383.4703993797302, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.3279995918274, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.9094362258911, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 431.24783754348755, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.7156789302826, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.02800011634827, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.70239877700806, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.4281632900238, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.055522441864, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.9203209877014, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.6140785217285, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.5257587432861, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.0452761650085, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.5503988265991, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.0516810417175, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.5406398773193, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.0111994743347, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6547193527222, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 343.8593578338623, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 306.2324810028076, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 286.6699206829071, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 335.2665615081787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.46751976013184, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 278.62751960754395, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 344.0990400314331, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 319.4875192642212, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.9278407096863, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 340.2192008495331, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.33520126342773, "config": {"BLOCK_SIZE_M": 128, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 282.98943758010864, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 344.2724812030792, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 314.9502408504486, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 294.36911940574646, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 336.9374406337738, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 315.5336010456085, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 286.7252838611603, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.9003186225891, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 312.473760843277, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.367520570755, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 334.7265613079071, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 299.4095981121063, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 274.95967984199524, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 414.65823888778687, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.8539206981659, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.7694363594055, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.9126410484314, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 414.7548806667328, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.22975873947144, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 427.466082572937, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.46880197525024, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.79456090927124, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 298.4494411945343, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.0571210384369, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.117280960083, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.3257601261139, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 300.56959986686707, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.30672001838684, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.8596785068512, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 384.7779190540314, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 300.81615924835205, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.26879811286926, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.8436770439148, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 383.302081823349, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 297.9479992389679, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.8054401874542, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.52944111824036, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1227.089433670044, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 292.55151867866516, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1205.5846405029297, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 291.9147193431854, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.11039352417, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.8095977306366, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1217.0462322235107, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 286.5070390701294, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 888.1289672851562, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.9308776855469, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 886.5963172912598, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.52991771698, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 904.5785570144653, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.4401574134827, "config": {"BLOCK_SIZE_M": 
256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 913.9846467971802, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.3406386375427, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 882.2273540496826, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.5999975204468, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 893.1643295288086, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.8686375617981, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 882.8012704849243, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.7043180465698, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 911.9380807876587, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.6737604141235, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 922.434720993042, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.7742404937744, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 966.8311977386475, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.282398223877, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 902.4003148078918, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.1715264320374, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 911.4887952804565, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.7982449531555, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3696.5366554260254, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.8011236190796, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3732.1895599365234, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.22015714645386, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3699.6262168884277, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.662082195282, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3706.461296081543, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.19007635116577, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.9563217163086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.55840396881104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.3790431022644, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.49184131622314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.8371181488037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.13024044036865, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.67936658859253, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.29919958114624, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.3075189590454, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.582558631897, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.28384256362915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.3950409889221, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 
4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.87712049484253, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.2438397407532, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.7696056365967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.75872230529785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.2409596443176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.92655992507935, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 434.6607995033264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.481764793396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.4785614013672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.8153614997864, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.21663761138916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.93391704559326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.1708827018738, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.7081604003906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.4312014579773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 433.1222414970398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.8798394203186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.2542381286621, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.88272428512573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.95919704437256, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.57552576065063, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 436.070077419281, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.23888540267944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.29759550094604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.48527574539185, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 440.8566403388977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.7209553718567, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.8857612609863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.2379183769226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.0705609321594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.711040019989, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 461.6214370727539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.99391746520996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.7171139717102, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.97216176986694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.5228786468506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.865918636322, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.0990409851074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.3924775123596, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.5243248939514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.476318359375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.3304009437561, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.0241599082947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.72800636291504, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.6454372406006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.13855934143066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.2596802711487, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.92607593536377, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.67551851272583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.6871991157532, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.56575775146484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.3785548210144, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.9990358352661, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.39296531677246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.066400051117, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.92351770401, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.5449628829956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8848032951355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.401282787323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.1929602622986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.7252793312073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.19904041290283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.9563179016113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.3228807449341, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.9796743392944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.3915224075317, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.7502398490906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.7787222862244, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.8779225349426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.3395233154297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.1052803993225, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.9151916503906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8948850631714, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.8580799102783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.8652801513672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.5963191986084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 483.16176652908325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.57024002075195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.5827188491821, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.4780769348145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.6632022857666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.9329562187195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.5542378425598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.6558456420898, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.94880199432373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.8142399787903, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.0744061470032, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 433.7159991264343, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.7987289428711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.212797164917, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 433.98303508758545, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.1723213195801, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.4704008102417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.0047974586487, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.35472202301025, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.5628786087036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.4870457649231, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.09455823898315, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 436.3423991203308, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 436.0419225692749, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.9107208251953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.50063848495483, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 431.0747194290161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.21760272979736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.9652853012085, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.507200717926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.983521938324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.9140787124634, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.6187205314636, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.0563235282898, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.42288064956665, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 436.0275197029114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.83583784103394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.53711891174316, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.161283493042, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.2231955528259, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.5696039199829, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.108962059021, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.71615505218506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.38416624069214, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.7817621231079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 433.5774350166321, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.7136034965515, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.8428831100464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.72543573379517, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.66991996765137, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.3833613395691, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 473.08656215667725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.93136405944824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.4388747215271, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.25920009613037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.52256441116333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.084321975708, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.54383850097656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.1487979888916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.73967456817627, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.51216411590576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.5579180717468, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 461.67664527893066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.00431728363037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} 
-{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.1380772590637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.94896268844604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.6292796134949, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.11967945098877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.18064069747925, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.40784072875977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.4022407531738, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.1137537956238, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.3993601799011, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.7697620391846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.1428818702698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.1088018417358, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.4974431991577, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.7798466682434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.57647609710693, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.9835209846497, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.5660786628723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.7955160140991, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.7660789489746, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.9689598083496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.82480096817017, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.4470367431641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.72655391693115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3692812919617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.60896015167236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 443.3014392852783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.3415951728821, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.30239725112915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.74992275238037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.39855670928955, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.07999420166016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.2233581542969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.44000339508057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.57216024398804, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.8572793006897, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.29952239990234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.784321308136, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.8839955329895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.2393593788147, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.8023953437805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.40768480300903, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.9350414276123, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.70128059387207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.34560108184814, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.57952308654785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.24032258987427, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.06319522857666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.0708808898926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.7772798538208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.1243152618408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.14335918426514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.3484802246094, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.5487937927246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.39183950424194, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.69983768463135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.4884886741638, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.1195139884949, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.1302423477173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.23327255249023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.9713616371155, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.87023878097534, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.13808393478394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.71343994140625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
473.7079977989197, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.4326367378235, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.6467170715332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.18144035339355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.6388816833496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.1617622375488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.2748794555664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.4883222579956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.4385576248169, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.05888080596924, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.13343477249146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.4577589035034, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.0025644302368, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.91552114486694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.81663751602173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.81791830062866, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.4726357460022, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 767.197916507721, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.082079410553, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 786.714243888855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7886385917664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.852481842041, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.534722328186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.7598419189453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.1523246765137, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.7343945503235, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.99600315093994, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.84351873397827, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.9803171157837, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 434.1148781776428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.311044216156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.8899164199829, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.68144130706787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.30592012405396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.47135972976685, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.0942358970642, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.5675196647644, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.08352184295654, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.49744272232056, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.17920207977295, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.28320121765137, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.6542434692383, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.9796848297119, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.86656188964844, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.37951707839966, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.70063638687134, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.04495573043823, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.5516800880432, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.44208240509033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.1054368019104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.99567890167236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.7369589805603, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.05903673171997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.82495975494385, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.3484797477722, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.03663635253906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.28352975845337, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.36192178726196, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.17983865737915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.0140805244446, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.83839750289917, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.83168029785156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.1084842681885, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.49359607696533, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 
4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.62848377227783, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.5153613090515, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.2751979827881, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.63120794296265, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.09839391708374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.91007947921753, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.05375576019287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.8743968009949, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.2927956581116, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.1227169036865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.6068768501282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.54928064346313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.59391927719116, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.27872133255005, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.84575605392456, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.6030378341675, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.4520010948181, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.05903911590576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.8240032196045, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.7820816040039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.8951997756958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9270401000977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.4100775718689, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.1683259010315, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.6390390396118, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
457.0980763435364, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.1552023887634, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.9057626724243, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.018883228302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.23583698272705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.84751892089844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.3276786804199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.6649580001831, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.94271993637085, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.0824017524719, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.14528036117554, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.624963760376, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.932963848114, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.85599422454834, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.07487392425537, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.0648012161255, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.6894392967224, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.2630443572998, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.29567766189575, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.5257577896118, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.36144256591797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.9403233528137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.4174361228943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.61487865448, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.59791803359985, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.4643168449402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, 
"num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.6046404838562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.8256068229675, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.37983560562134, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.76239585876465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.8902382850647, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.6880025863647, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.0209550857544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 727.4769568443298, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.9939169883728, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.1294412612915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.8537578582764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.065755367279, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.6790347099304, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.3268814086914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.7532811164856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.6017642021179, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.5286350250244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.3379249572754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.78639793396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 737.7025604248047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 760.4059147834778, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.4943985939026, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.1902389526367, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 694.8574376106262, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.1652812957764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 740.5695986747742, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.905595779419, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.0817589759827, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.8324847221375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 689.0115189552307, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.53136444091797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.0795192718506, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.57920026779175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.50063610076904, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.73280477523804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 454.6331238746643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.6124768257141, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.79504013061523, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.2592034339905, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.07007360458374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.1156802177429, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.1540832519531, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.4987201690674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.5198383331299, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.9540796279907, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.0764765739441, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.64927768707275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.32623958587646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.90144443511963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.75680017471313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.53199768066406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.6596760749817, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.64463901519775, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.5447964668274, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.0244812965393, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.6344027519226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.3812837600708, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.2035217285156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.9411211013794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.7512001991272, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.03040409088135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.6561574935913, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.0724835395813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.797119140625, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.79487705230713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.45487642288214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.6020760536194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.81135845184326, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.89231395721436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.06767749786377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.3732786178589, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.8678421974182, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.5870451927185, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.3310375213623, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.16400051116943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.76976346969604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 484.3358373641968, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.9075255393982, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.6555185317993, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.9603171348572, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.142080783844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.79616403579706, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.35599279403687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.23088359832764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.0662384033203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.3123264312744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.25360012054443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.5620756149292, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.20159816741943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.69071865081787, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 461.54159784317017, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.4303970336914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.38575553894043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.99744176864624, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.5207953453064, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 680.3891205787659, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.4177598953247, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.1387257575989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.3819231987, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.8014478683472, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.7635231018066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.9121556282043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.970883846283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 683.9639925956726, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.5492835044861, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.8779239654541, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.4555177688599, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.0505647659302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.4040040969849, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.1529560089111, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.3172812461853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.41279888153076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.99872398376465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.2251172065735, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.52207708358765, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.9119944572449, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.9054388999939, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.94224309921265, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.6446418762207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.9697642326355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.2275228500366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.8505549430847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.96239948272705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.3331198692322, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.59007930755615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.3022389411926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.8209595680237, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.61152029037476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.81071853637695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.35375785827637, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.0558409690857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.8596806526184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.2358388900757, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.249764919281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.3332781791687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.341121673584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.9678440093994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.4801607131958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.36239767074585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.50975799560547, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 
3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.04368591308594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.18607902526855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.6376004219055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.5953550338745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.16863918304443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.0497603416443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.24015522003174, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.57952308654785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.3806447982788, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.2219228744507, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.00928258895874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.09008026123047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.334077835083, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.5054392814636, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 461.3097596168518, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.6617579460144, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.7420792579651, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.54159450531006, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 918.4292793273926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.1011204719543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 961.8967962265015, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.6216015815735, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 915.5791997909546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.8563194274902, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 949.522876739502, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 547.990403175354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.8790340423584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.780478477478, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.468638420105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.9583992958069, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.23247957229614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.21184253692627, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.02352380752563, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.8542437553406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.990882396698, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.1280016899109, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.9084792137146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.7408013343811, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.29120206832886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.7112030982971, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.6505618095398, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.3475275039673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.54944133758545, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 448.9099168777466, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.804160118103, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.77951860427856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.10079860687256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.08528327941895, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.9727964401245, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.8804793357849, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.1912021636963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 
128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.6355199813843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.6153607368469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.38479709625244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.7116723060608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.3585591316223, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.933916091919, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.7038416862488, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.1604833602905, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.399516582489, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.7622413635254, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.7667169570923, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.792311668396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.26416206359863, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.0612840652465, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.1763205528259, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.38415813446045, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.4963173866272, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.256959438324, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.0512022972107, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.8968005180359, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.2931170463562, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.3188824653625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.2918357849121, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.6201639175415, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.13759899139404, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 483.5639977455139, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.2959895133972, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.8947200775146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.170560836792, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2878403663635, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 605.8300733566284, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.7235188484192, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.28463888168335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.5547194480896, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.3111972808838, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.795202255249, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.3476800918579, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.31280183792114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.0838356018066, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.5984034538269, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.5651206970215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.0366430282593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.6552014350891, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.0913577079773, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.4553623199463, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.8739199638367, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.5588803291321, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.2456030845642, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.9768013954163, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.5862407684326, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.1004781723022, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.2425637245178, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.4713606834412, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.7796792984009, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.8694458007812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.1110420227051, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.9463987350464, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.807354927063, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.5708832740784, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.5168023109436, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.2422413825989, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.4414420127869, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.8063998222351, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 852.9510402679443, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 867.6070356369019, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 867.175350189209, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 876.607837677002, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 880.0158500671387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.2206449508667, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 856.5936088562012, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 882.6395177841187, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 459.28239822387695, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.6736030578613, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.68272161483765, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.80880212783813, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.1408009529114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 449.2080044746399, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.9041557312012, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.17791748046875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.8457589149475, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.82864809036255, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.9638395309448, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.3984022140503, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.29583501815796, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.7081651687622, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.9934401512146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0163216590882, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 456.99520111083984, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.1779217720032, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.1622462272644, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.7960014343262, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.2519965171814, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 460.47680377960205, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.38112020492554, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.260639667511, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.0785593986511, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.2048010826111, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.44480419158936, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.65999603271484, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.5547204017639, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 458.12479972839355, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.7275195121765, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.0918426513672, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.1025576591492, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.9486374855042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.7270374298096, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.3964776992798, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.0095992088318, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.7239999771118, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.3022375106812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.382559299469, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.8540754318237, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.692325592041, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.0692849159241, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.1390419006348, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.959520816803, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.6307163238525, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 651.6734457015991, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7107257843018, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.9980821609497, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.27535247802734, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.18943786621094, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.12784004211426, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.7944059371948, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.93055868148804, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.1607995033264, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.3782424926758, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.37776231765747, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.69552659988403, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.3015990257263, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.2847990989685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.3799991607666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.1166386604309, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.84160137176514, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 462.42159605026245, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.3630437850952, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.49312019348145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.51471996307373, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.1001687049866, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.26128482818604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, 
"num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.3609666824341, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.0411195755005, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.81823539733887, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.16399765014654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.3020796775818, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2923192977905, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.5092759132385, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.7312026023865, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.1987190246582, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.5940837860107, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.9623975753784, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.7955207824707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.1281614303589, "config": {"BLOCK_SIZE_M": 
64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.7175970077515, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.3671932220459, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.8595190048218, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.1315155029297, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.7591972351074, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.56207752227783, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 835.6031894683838, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 818.4662485122681, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.6188836097717, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.6275200843811, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.7331228256226, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.6758451461792, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 702.2902393341064, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 728.7976050376892, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.5948877334595, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 830.9124898910522, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.8158373832703, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 723.8008046150208, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 858.5790395736694, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 799.6700763702393, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.0625596046448, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 742.6270413398743, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 839.1521549224854, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 823.2535982131958, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.7518391609192, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 726.2433576583862, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 832.1303987503052, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 800.4521584510803, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 722.0651245117188, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 720.3507232666016, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 836.7769598960876, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 820.2064108848572, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.5403218269348, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 721.1760020256042, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 851.9001770019531, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.8185601234436, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.136962890625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 730.6113648414612, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 5}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1014.4359922409059, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 989.2935991287231, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1023.8619184494017, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 996.4471960067749, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1036.9926309585571, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1004.6833562850952, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1045.6390380859375, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 998.192629814148, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.0297622680664, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.181914806366, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.3636798858643, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.2863955497742, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.2203235626221, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.9193625450135, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.9127955436707, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.5395121574402, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.615843296051, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.1145558357239, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.5742311477661, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.4755244255066, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.7340793609619, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.1411185264587, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.6751928329468, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.7374377250671, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.435516834259, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 513.5256028175354, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.6639986038208, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.5617570877075, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.7862424850464, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.3012833595276, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.8846440315247, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.50416231155396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 790.8908820152283, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.452962398529, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 785.0726413726807, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 676.9409680366516, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 791.2734389305115, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.1209597587585, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 
16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.2348818778992, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 684.179515838623, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 651.5678429603577, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.069598197937, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.568642616272, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.3299198150635, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.6228814125061, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.4126400947571, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.7259168624878, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.2532777786255, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.3606419563293, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.1836829185486, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.3316802978516, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.53648042678833, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.3808007240295, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.6769647598267, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.5051217079163, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.65295696258545, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2353.8430309295654, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.8521604537964, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2359.606056213379, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.381917476654, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2369.126396179199, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.8824033737183, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2365.816173553467, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 552.8760004043579, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1705.1086330413818, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1187.0796871185303, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1709.7920036315918, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1207.513279914856, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1706.3822412490845, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1214.5001649856567, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1729.2284870147705, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1204.6752071380615, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1726.354742050171, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1190.2595043182373, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1732.1048164367676, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1186.7190408706665, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1709.7470474243164, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1185.4171133041382, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1713.63920211792, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1217.8872060775757, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1715.4060745239258, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1068.0207920074463, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1724.0322875976562, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1064.9443197250366, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1721.7067241668701, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1049.4910383224487, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1743.8265705108643, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1057.1158361434937, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6408.664588928223, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 815.3313589096069, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6532.301502227783, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.5982403755188, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6358.906688690186, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 852.1980857849121, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6353.005447387695, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 16, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 841.2091159820557, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.0852756500244, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.96560621261597, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.0860848426819, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.1376008987427, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.7555198669434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.1956729888916, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.4433612823487, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.44815540313726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.150399684906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.0355176925659, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.43343925476074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.1057562828064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.6828765869141, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.20159626007074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.204161643982, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.87119865417486, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.4080033302307, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.6735987663269, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 479.88768100738525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.0135974884033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.7760000228882, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.6187224388122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.01408386230474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.68608140945435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.0433621406555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.7516837120056, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.4718384742737, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.5723304748535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.9081573486328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.06592035293585, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.4692831039429, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.1873579025269, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.12127351760864, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.0223970413208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.32496213912964, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.9419169425965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.1529603004456, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.21744441986084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.89936256408697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.1891169548035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.2335991859436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.1750416755676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.9977602958679, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9316725730896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.0199990272522, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.23136472702026, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.0340785980224, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.8393588066101, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.8497619628906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.7232003211975, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.32144498825073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.3366394042969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.4559960365295, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.1427221298218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.3049631118774, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.8111987113953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
497.55679845809937, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.9460768699646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.294397354126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.2887954711914, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.6310377120972, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.7464070320129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7935910224915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.8652720451355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.6719999313354, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.0483255386353, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.5270366668701, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 699.3484783172607, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.7481603622437, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.3449606895447, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 712.8734374046326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 730.3188872337341, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.720639705658, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.4857611656189, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.7827181816101, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.4227209091187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.3163223266602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.9443206787109, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.7758402824402, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 724.8345613479614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.0686411857605, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.6054368019104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 
3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 699.7318410873413, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.5819187164307, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.231517791748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.835994720459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.2327971458435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.839198589325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.8398427963257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.193118095398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.4913573265076, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.1529598236084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.0884766578674, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.1091213226318, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.064160823822, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.9113621711731, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.58448362350464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.8447952270508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.4459261894226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.73663902282715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.75567674636847, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.56704235076904, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.12464237213135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.7566428184509, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.83055925369257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.57632398605347, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.91792821884155, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
487.99424171447754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.93760347366333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.45055770874023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.68336057662964, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.6025619506836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.051522731781, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.7417573928833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.43215703964233, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.4963231086731, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.3262391090393, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.8684763908386, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.40191745758057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.0599961280823, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.69808626174927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.2529630661011, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.89280462265015, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.1897587776184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.74768114089966, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.26880168914795, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.31295919418335, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.19312381744385, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.6156873703003, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.93568181991577, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.8990378379822, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.1467208862305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.9643177986145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.08560037612915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.385437965393, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.7334337234497, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.52720308303833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.60815620422363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9214396476746, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.9135975837708, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.98560380935663, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.01311779022217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.68336677551275, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.9588847160339, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.5678377151489, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.1737632751465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2985548973083, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.5193614959717, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.65968132019043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.51663637161255, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.5499176979065, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.1532826423645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.87968015670776, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.9691219329834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.2126388549805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.8881559371948, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.8816003799439, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.63264179229736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 507.3636770248413, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.3644785881042, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.1956796646118, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.080952167511, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6624007225037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.5057644844055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.6441602706909, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.5587210655212, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9558339118958, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.396960735321, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.2228832244873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.907205581665, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.5827231407166, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.8382411003113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.0884766578674, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.6764788627625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7270412445068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.233277797699, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.5124821662903, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.7892827987671, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.0086398124695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.16544103622437, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.6416029930115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.19519901275635, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.7753577232361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.5175995826721, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, 
"num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.1102418899536, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.88047790527344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.1379222869873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.59039878845215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.0420804023743, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.64144468307495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.0705590248108, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.71183919906616, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.72800159454346, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.91439628601074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.28063917160034, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.4681577682495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2731213569641, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.6806421279907, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.23423957824707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.79680490493774, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.79872035980225, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.48015785217285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.9545621871948, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.79407596588135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2897605895996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.6164779663086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.4640016555786, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.9468755722046, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.4727997779846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 520.5027174949646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.96047639846796, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.56031942367554, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.48560094833374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.9825611114502, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.60783863067627, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.1030402183533, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.4342432022094, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.5916862487793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2044858932495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.70896196365356, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.49471616745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.4334387779236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.93199586868286, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.98320150375366, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.86560440063477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.60128116607666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.64447879791265, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.744001865387, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.3035182952881, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.3958430290222, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.44208002090454, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.0887985229492, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 866.906886100769, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.9582347869873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.1014375686646, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.1659183502197, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 866.7200040817261, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.0760040283203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 869.6676921844482, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.3676776885986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.78768014907837, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.96239852905273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.2587242126465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.489764213562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.5822443962097, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.38720750808716, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.8851227760315, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 492.0804810523987, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.99680185317993, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.25840044021606, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.5697536468506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.9663972854614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.95616388320923, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.58015871047974, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.3919973373413, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.17039251327515, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.234881401062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.96000051498413, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.7324786186218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.10048151016235, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.99952459335327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.0616021156311, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.8316869735718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.04320192337036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.0323238372803, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.9233589172363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.7528042793274, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.81664228439325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.3879933357239, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.72768545150757, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.854241847992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.83984327316284, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.6008014678955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.2598400115966, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.09023904800415, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.21871566772455, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.729278087616, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.27040338516235, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6631994247437, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.3028817176819, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.19456005096436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.5822310447693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.2023949623108, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.4051175117493, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.744161605835, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
506.3804769515992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.448956489563, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.2846465110779, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.46095848083496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.48223924636835, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.07024240493774, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.55775880813604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.2099237442017, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.2321605682373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2912049293518, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3030366897583, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.72896242141724, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.7583951950073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.2696032524109, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.45199728012085, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.465916633606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.80944204330444, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.4297552108765, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.8574438095093, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.38479423522955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.62896394729614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6576013565063, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.303361415863, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.5648002624512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4457621574402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.4225606918335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.1566371917725, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.72463893890387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.38607931137085, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.7014427185059, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.5222392082214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.4579191207886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.5759954452515, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.6089601516724, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.6377558708191, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.1385588645935, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.7558331489563, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.7436771392822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.7686376571655, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.9998397827148, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.8911991119385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.7579202651978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.1947197914124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.6977610588074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.45823907852173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.331202507019, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.0756840705872, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.8144016265869, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.8078455924988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.0916843414307, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.9478406906128, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 615.9201622009277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.7793612480164, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 826.6424036026001, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.1076788902283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 722.3720026016235, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.0847988128662, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.8398418426514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 803.6212778091431, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 829.3494415283203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.8339185714722, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 726.9435238838196, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.683042049408, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.3646330833435, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 800.8910441398621, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.2955222129822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.8868818283081, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 734.4838428497314, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.6835179328918, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.7505531311035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 795.7335948944092, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 831.3246440887451, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.8571186065674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 731.4302444458008, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.1179141998291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.72704219818115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.22863578796387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.07167768478394, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.3001594543457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.31008195877075, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.32496452331543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.17152070999146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.7907247543335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.2497601509094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.6894392967224, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.1008014678955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.0900812149048, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.82079696655273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.8907175064087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.8904004096985, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.81584024429327, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.4513602256775, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.9657621383667, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.19792652130127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.8231997489929, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.09055852890015, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.2382426261902, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.87568283081055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.51968050003046, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.94111680984497, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.9056010246277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.03504037857056, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 502.32799768447876, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.3825578689575, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.2289652824402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.18752098083496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.8846392631531, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.4145579338074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.21376180648804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.9907178878784, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.2952003479004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.7233514785767, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.4108853340149, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.8172783851624, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.9878387451172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.2219152450562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.44624423980713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.9891228675842, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.1318402290344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.4284768104553, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.9953546524047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.807680606842, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.0790395736694, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.03600025177, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.6793584823608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.5267171859741, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.8039951324463, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.1043186187744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9806389808654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.8462357521057, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.6281590461731, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.8124828338623, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.98912191390986, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.9908757209778, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.218234539032, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.8076710700989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.4571237564087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.9916763305664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.9270415306091, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.6372828483582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.4788789749146, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.2102379798889, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.878080368042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.3063960075378, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.6428799629211, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 651.9772815704346, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.8224067687988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.6457567214966, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.9532804489136, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.7401566505432, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.6641621589661, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.66304063797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.433919429779, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 654.3926358222961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.829761505127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.0047998428345, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.7449617385864, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.08128213882446, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.51520061492914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.3510456085205, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.0348844528198, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.78543853759766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.73440551757807, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.5628824234009, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.7692813873291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2723159790039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1662397384643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.6169595718384, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.05583572387695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.82895612716675, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.3371253013611, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.5660762786865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.5953674316406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.06799554824835, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.3316817283631, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.16192150115967, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.27135276794434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.666241645813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.54416275024414, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.2276768684387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.17775774002075, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.04383945465094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.5272011756897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.2217574119568, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.9835205078125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.8321557044983, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.53727436065674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.9332790374756, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.8271994590759, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.1246418952942, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.7857508659363, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
496.40608310699463, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.9288001060486, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.1668796539307, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.5225591659546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.7696051597595, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.3644866943359, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.249762058258, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.0703997612, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.35423946380615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.4539179801941, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.3895974159241, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.4971203804016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1055.4275178909302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.9315228462219, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1057.2361612319946, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.6897597312927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1055.867838859558, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.4671940803528, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1061.0974502563477, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.7521634101868, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.52943992614746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.08495855331427, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.1392059326172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1971244812011, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.5630431175232, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.80399799346924, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.97360038757324, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.8222389221192, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.5934357643127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.1843147277832, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.6988787651062, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.0335998535156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.70464277267456, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.7904000282287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.380163192749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.1184000968933, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.10960149765015, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.47471618652344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
499.03136253356934, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.75695562362665, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8500752449036, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.1115159988403, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.5495972633362, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.72704124450684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.0276794433594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.90224170684814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 698.6412787437439, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.24688196182257, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 699.6337628364563, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.5758428573608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 698.2160019874573, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.3816027641297, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.4918427467346, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.9278435707092, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.0838398933411, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.234236240387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.9329586029053, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.7140774726868, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.7951965332031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.8648018836975, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.5775990486145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.6791982650757, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.0948805809021, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.0353574752808, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, 
"num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.3470392227173, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.4479994773865, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.0177612304688, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.5779228210449, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.1201601028442, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.35280227661127, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.4374408721924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.2092814445496, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.455677986145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.1811218261719, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.4419178962708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.8161606788635, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.9446401596069, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 
32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.8720011711121, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.5518345832825, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.1255984306335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.2588815689087, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.5081572532654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.2849626541138, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.2387142181396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.9342470169067, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 682.4647974967957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.7521586418152, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.4576063156128, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 726.3558435440063, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
714.9828791618347, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.5409641265869, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.6811218261719, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.7537617683411, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 742.9540824890137, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 730.9065628051758, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 718.0647993087769, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.320164680481, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.039514541626, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.2228770256042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.3521609306335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.824960231781, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 723.3262419700623, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.501118183136, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.5208015441895, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.878876209259, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.4704031944275, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 733.6801552772522, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.2984027862549, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 940.7623863220215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 955.5958366394043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 953.7668800354004, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 951.1556768417358, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 944.8329639434814, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 963.8323259353638, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.7395153045654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 
2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 963.7534379959106, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.2254385948181, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.4683165550232, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.3787198066711, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.4079999923706, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.2459173202515, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.318877696991, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.6484789848328, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.2289581298828, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.5846428871155, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.2350363731384, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.6835241317749, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2571206092834, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.5548787117004, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.8451209068299, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.7647972106934, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.2681574821472, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.0959987640381, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.92512702941895, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2835206985474, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.1003170013428, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.8113651275635, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.850239276886, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.3204736709595, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.534722328186, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2062406539917, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.7150421142578, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.4735984802246, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.7067203521729, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.6193552017212, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.5945534706116, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.5275173187256, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.3984007835388, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.6486411094666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.5971174240112, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 718.0782389640808, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.5731272697449, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.7060770988464, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 577.7814388275146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.1334404945374, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.5880002975464, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.5689625740051, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.8631939888, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 718.2345581054688, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.2115187644958, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.0481600761414, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.1619215011597, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.7983980178833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.7308821678162, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.82367610931396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.340961933136, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.59951972961426, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.330717086792, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.3159999847412, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.0375967025757, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.3964800834656, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.955837726593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.5385570526123, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.7988820075989, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.8028831481933, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.7795214653015, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.76224279403687, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.5102415084839, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.5587229728699, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.95167875289917, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.45408296585083, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.69631719589233, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.9968008995056, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.6489644050598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.63295888900757, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.6289563179016, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.32287883758545, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.4457588195801, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.3753633499146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.6912040710449, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.8660802841187, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
567.6347208023071, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.1766395568848, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.0535988807678, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.3782396316528, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.756959438324, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.949914932251, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.4900794029236, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.9505658149719, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.292640209198, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.938717842102, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.1531229019165, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.5751953125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.895998954773, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 926.3916778564453, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 888.7593650817871, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.0353546142578, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 791.7734408378601, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 929.420166015625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 884.3414449691772, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 766.3147211074829, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 807.7310419082642, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.5884742736816, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 902.4166536331177, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 781.7185640335083, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 789.637439250946, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.4544010162354, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 887.3819255828857, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.3927998542786, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 809.4550561904907, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.191349029541, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 903.2862281799316, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.9759964942932, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 790.7872009277344, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 926.6812753677368, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 894.0708780288696, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 784.125759601593, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 787.0670342445374, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 929.6679878234863, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 903.9553642272949, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.4537601470947, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 789.2934417724609, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 930.4092741012573, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 892.2116899490356, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 789.1793632507324, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.3494372367859, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1137.5139141082764, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1090.9420776367188, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1137.4851179122925, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1101.7759990692139, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1130.6772804260254, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1092.7225637435913, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1142.1615934371948, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1100.2232074737549, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.1915183067322, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.0406422615051, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.7856040000916, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.0784091949463, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.0051217079163, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.5140814781189, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.0745568275452, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.4980778694153, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.2331204414368, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.0476822853088, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.6044769287109, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.461594581604, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.7257614135742, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.5364837646484, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.5271973609924, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.1447939872742, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.4283208847046, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.2326436042786, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.593279838562, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.9751996994019, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.8041605949402, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.6540746688843, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.3017597198486, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.1187200546265, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 4}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 864.3463945388794, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 742.0249629020691, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 862.1806335449219, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.2153587341309, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 869.3523120880127, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.6206398010254, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 868.528323173523, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.3745565414429, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.306236743927, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.1201601028442, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.6222410202026, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.6719989776611, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.098560333252, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.1249570846558, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 689.7984051704407, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.312801361084, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.5449585914612, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.4169569015503, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.5604820251465, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.1217579841614, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.3001565933228, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.9342365264893, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.6128039360046, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.1451215744019, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2486.128807067871, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.3510394096375, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2486.2731170654297, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.864963054657, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2493.8580989837646, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.494556427002, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2496.9319820404053, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.5116767883301, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1867.194709777832, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1290.5307245254517, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1900.1716804504395, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1321.0308742523193, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1893.198709487915, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1319.5684814453125, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, 
"num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1890.816307067871, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1333.3575963974, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1875.5905723571777, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1301.571192741394, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1901.9284629821777, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1325.803198814392, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1898.8699054718018, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1305.6180810928345, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1894.7088050842285, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1328.2320070266724, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1886.0566425323486, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.46928024292, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1887.835521697998, "config": {"BLOCK_SIZE_M": 256, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1171.8750476837158, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1886.886568069458, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1172.1923303604126, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1892.0287895202637, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1172.137746810913, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7437.707328796387, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.2992143630981, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7446.624984741211, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 961.2612819671631, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7449.484100341797, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 951.5705585479736, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7462.863845825195, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 32, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 963.9700746536255, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.4683265686035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.8812799453736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.2039957046509, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.8849558830262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.535041809082, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.6169619560241, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.2484784126282, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.2203259468079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.99952030181885, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.2609548568726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.0217633247376, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.69744157791143, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.3819198608398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.03440093994146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.8193626403809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.5635228157043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.5350422859192, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.783679485321, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.25088262557983, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.3147230148315, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.6255970001221, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.92319965362555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.0958437919617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8707165718079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.5857620239258, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.54591274261475, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.5342445373535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.8804788589477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.7443189620972, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.62703847885126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.136960029602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.5027170181274, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.5225553512573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.8708806037903, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.98223972320557, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.7809634208679, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.8465633392334, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.1214451789856, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.06112241745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.9718308448792, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.25199985504156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.18080043792725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.1675229072571, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7388806343079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.5206365585327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7491164207458, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.0622401237488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.7001576423645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.76000070571905, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.197283744812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 502.6582384109497, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.6271958351135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.5048017501831, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.8639969825745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.4891223907471, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.8833632469177, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.92848157882685, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.8481554985046, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.08863735198975, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2897582054138, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.8878445625305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.4760003089905, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.6807999610901, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.4735999107361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.0507230758667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.1046380996704, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.5580816268921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 727.6252770423889, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.7937569618225, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.9860825538635, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.1772804260254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.8801574707031, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.6022386550903, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.5713601112366, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.6812782287598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 733.1273603439331, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.0902342796326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.7324805259705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.580798625946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.8555135726929, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.4102449417114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.7283158302307, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.3057560920715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 726.580958366394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.8023977279663, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.2582440376282, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 735.271520614624, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 756.7788791656494, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.7030410766602, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.3348803520203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.9729561805725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 734.891197681427, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.9734344482422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.4046382904053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.7936015129089, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 764.5177602767944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3163237571716, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.20784091949463, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.74639987945557, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.653603553772, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.2846422195435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 493.91664266586304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.6219220161438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.8408017158508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.1708860397339, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.87439918518066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.3262367248535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.40255975723267, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9654397964478, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.1000008583069, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.4718360900879, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1510405540466, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.8039984703064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.7027220726013, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.2012801170349, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.23839950561523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.1929655075073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.3583993911743, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.53119945526123, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.5385603904724, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.8937606811523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.7992067337036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.3260793685913, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.04560232162476, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.7900810241699, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.5387210845947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.19727754592896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.35712242126465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.2566428184509, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.8076796531677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.5062427520752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.4223985671997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.74303865432734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.8964829444885, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.195041179657, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.9041628837585, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.346875667572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.742883682251, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2823967933655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.3124761581421, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.0180788040161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.20256042480474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.4777574539185, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.8342337608337, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.34767723083496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.69407510757446, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.0776033401489, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.5623970031738, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.295841217041, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.77536582946783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.6928019523621, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7587199211121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 493.7124800682068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.0947203636169, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.5115194320679, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.4115238189697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.649441242218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.77664709091187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.9239993095398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.5728025436401, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.7089648246765, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.0392031669617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2116832733154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.5271978378296, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.3435215950012, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 
64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.3740773200989, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.6324815750122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.4326395988464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.4683208465576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.4478397369385, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.2807984352112, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.4932799339294, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.4372835159302, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.9358386993408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.9153552055359, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.4440026283264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.95216274261475, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.37168264389044, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.95215940475464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.6353597640991, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.2968006134033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.33807945251465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.9230399131775, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.21999740600586, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.9783983230591, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.5326375961303, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.7496008872986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.99999380111694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.1078386306763, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.2721600532532, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
497.1339225769043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.41776323318476, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.6593608856201, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.3484783172607, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.18656301498413, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.54144525527954, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.8801574707031, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0513668060303, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.6087999343872, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.4169602394104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.27823972702026, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.0735998153687, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.24592208862305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.59504032135004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.9980845451355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.8708758354187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.70400524139404, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.5147199630737, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.859034538269, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.0708780288696, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.243043422699, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.9550361633301, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.24896287918085, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0473589897155, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.9753608703613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.4312000274658, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.4337573051453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.00447845458984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.1342363357544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.1036796569824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4849605560303, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.468475818634, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.4718384742737, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.40480041503906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.4371123313904, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.7673602104187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.7313566207886, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.2281656265259, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.5443224906921, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.30847787857056, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.067202091217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.25247716903687, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 873.9595222473145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.3214464187622, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 875.6593608856201, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.8228802680969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 878.1020879745483, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.2180762290955, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 880.4219150543213, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.1100835800171, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.27296209335327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.1137628555298, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.2151994705201, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.0030369758606, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.88592195510864, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.97920179367065, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.8961606025696, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.6764802932739, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.596800327301, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.99375581741333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.06112337112427, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.85072040557867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.7419214248657, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.77120208740234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, 
"num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.08976078033453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.7332777976989, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.28240728378296, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.6641597747803, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.22351980209345, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.2849593162536, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.53344202041626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.03599643707275, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.4019227027893, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.38560771942144, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.8268775939941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.10064029693604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.7129611968994, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.7127995491028, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.9963202476501, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.41280174255365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.0297555923462, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.87760210037237, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.4785590171814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.77696561813354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.9913630485534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.2907176017761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.331042766571, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.2971234321594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.4036812782288, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 523.7142324447632, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.8615975379944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.179039478302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4752068519592, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.0612831115723, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.9600014686584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.4860768318176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.8635258674622, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.8441586494446, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.6832003593444, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.8817615509034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2287936210632, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.4427247047424, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.7012791633606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.8084826469421, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.4262442588806, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.8625593185425, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.3200030326843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.30128383636475, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.8518390655518, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.5222420692444, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.7392001152039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6697607040405, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.3364782333374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.8494448661804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.3870453834534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 
4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.3788776397705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2649564743042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.3494372367859, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.9249663352966, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.6672039031982, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.4611210823059, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.7062420845032, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.6132788658142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.6665630340576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.5395140647888, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.4452862739563, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.1038355827332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.5643196105957, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.0839996337891, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.912317276001, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2507171630859, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.827995300293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.1527991294861, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.2779188156128, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.7446374893188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.9292783737183, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.7076783180237, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 676.8953657150269, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.4212794303894, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8569560050964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 531.6057634353638, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.5054368972778, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.1630387306213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.7710385322571, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.9713578224182, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.9702415466309, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.8340802192688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 815.1260709762573, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 849.7894334793091, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.2385592460632, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 748.7366414070129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.6025590896606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.8425626754761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 818.4918355941772, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.1739177703857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.3099236488342, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.4212837219238, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 774.86896276474, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.1727957725525, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 821.2684774398804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 839.7425603866577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.7608008384705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.967839717865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.8961625099182, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.0735983848572, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 834.2803192138672, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 852.4587154388428, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.2123212814331, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 756.9198393821716, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 784.0535974502563, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.164475440979, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.27888107299805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.1497611999512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.5118398666382, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.98335886001587, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.20271825790405, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.33328008651733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.408317565918, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.47759771347046, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.7579164505005, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.1075191497803, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.4516825675964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.7830386161805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.10864782333374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.3663954734802, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.5196771621704, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.7567992210388, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.31120014190674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.8830361366272, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.3494429588318, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.59376192092896, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 504.0660786628723, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.94495820999146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.8851180076599, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.38991832733154, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.5830411911011, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.67184019088745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1459274291992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.78336334228516, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.93295907974243, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.28384017944336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.3361630439758, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.2728028297424, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.6196827888489, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.5419187545776, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.9947237968445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.6750349998474, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.8462347984314, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.8102374076843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.0745620727539, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.0027122497559, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.4966340065002, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.5771150588989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.0299229621887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.1224040985107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.2390370368958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.3196802139282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.1967968940735, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.1902418136597, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.3068809509277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.1159982681274, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.3673648834229, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.8622407913208, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.9343981742859, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.4081563949585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.5344014167786, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.392156124115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.4068827629089, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.2889575958252, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.9097609519958, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.9139218330383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.9636859893799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.0587229728699, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.2937593460083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.913122177124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.0183973312378, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.3556823730469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.4774346351624, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.9942388534546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 756.0206389427185, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.5699229240417, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.0072040557861, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 612.5899243354797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.4566445350647, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.6368045806885, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.2313628196716, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.2142376899719, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.5833673477173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.510404586792, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.4208011627197, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.1846413612366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.8795166015625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.7809553146362, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.5174403190613, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2390375137329, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.6062397956848, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9942407608032, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.2299199104309, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.2062368392944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.08799934387207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.2209596633911, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.283196926117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.86656570434576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.8977608680725, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.5569634437561, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.0595188140869, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.9227194786072, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.745436668396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.797435760498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2140765190125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0139255523681, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.3289575576782, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.31904363632196, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.208963394165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.382077217102, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.26863384246826, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.9175972938538, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.4651203155518, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.64863872528076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.02175855636597, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
509.47824239730835, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.1793541908264, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.87583684921265, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.739360332489, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.9312014579773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.6950354576111, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.1539249420166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.8056025505066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.5804824829102, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.349123954773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.3574371337891, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.0881609916687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.8896050453186, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.6312022209167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.5390415191651, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.3262414932251, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.6239976882935, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.5355200767517, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1065.3275203704834, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.1688051223755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1065.5456018447876, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.2036776542664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1066.7499113082886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.6606431007385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1070.8376026153564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.6713600158691, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.1280026435852, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.5576062202454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.0049605369568, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3812808990479, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.12672090530396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.42480421066284, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.9694356918335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.8934421539307, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.0449600219727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.9678421020508, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.0359992980957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.1976022720337, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1160001754761, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.4071955680847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.7804846763611, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.9694423675537, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.7489619255066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.1625638008118, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.480637550354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2881603240967, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.7779173851013, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.6758422851562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.71952056884766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.9921536445617, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.597761631012, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.7950415611267, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 706.3737607002258, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.8225626945496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.4260787963867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.4784049987793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.6713557243347, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.8142409324646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.1977610588074, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.2214407920837, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.7790384292603, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.812958240509, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.4497566223145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.1145629882812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} 
-{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.3395204544067, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.2824048995972, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.9204816818237, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.5564775466919, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7796792984009, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.9665613174438, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.3696007728577, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.6177606582642, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.3550367355347, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 683.5390400886536, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.5329623222351, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.6483225822449, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.0748791694641, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.3558411598206, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.6289596557617, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.991204738617, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.2774410247803, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.7950401306152, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.5104055404663, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.3081617355347, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.029914855957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.6945600509644, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.889760017395, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.8529539108276, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.8329563140869, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 686.0369610786438, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.366231918335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.181921005249, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.2339253425598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.4809565544128, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.0009617805481, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 736.0620784759521, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.1799988746643, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 716.3363218307495, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 680.030562877655, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.0123243331909, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 756.300802230835, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.7283225059509, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 633.6806416511536, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.4948744773865, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 681.5555191040039, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.2014403343201, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 748.2766389846802, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.362238407135, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.7193627357483, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.5756778717041, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.622880935669, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.4641585350037, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.6007976531982, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.4745578765869, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 967.0486497879028, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 981.858081817627, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 967.1417617797852, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 984.5600080490112, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 977.3425722122192, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 993.8640022277832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 973.7819147109985, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 986.3534450531006, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.4427223205566, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.7827177047729, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.357922077179, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.9599962234497, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.480486869812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.8219218254089, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.4051179885864, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.8777604103088, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.3100790977478, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.51216173171997, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.5694403648376, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.4828805923462, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.0248031616211, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3713603019714, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.4503998756409, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.5815968513489, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.5576004981995, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.74351882934565, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.2067174911499, "config": {"BLOCK_SIZE_M": 
64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.5990414619446, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.411835193634, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.5153579711914, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.1009631156921, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.5553612709045, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.6894435882568, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.0027203559876, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.039840221405, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.4851202964783, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.0019197463989, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.6339201927185, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.1155195236206, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 548.5940766334534, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.7227206230164, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.7873635292053, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.6532821655273, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.4718379974365, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.348482131958, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.0403227806091, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.6062397956848, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.9921607971191, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.3544011116028, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.4743933677673, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 730.7888007164001, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.490879535675, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.1633596420288, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.009759426117, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 728.9177632331848, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.3723225593567, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.5254368782044, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.2932877540588, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.3552031517029, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.6464009284973, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.9086427688599, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.0392036437988, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.6619186401368, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.1876864433289, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.60911846160883, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.2755198478699, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.253764629364, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.6262397766114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.22016048431396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.2065658569336, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.056797504425, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.15903711318964, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2193646430969, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.51519918441767, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.41839694976807, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8806443214417, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.24096345901495, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.00079870224, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.11279296875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.0025644302368, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.5606412887573, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.5956768989563, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.9220786094666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.1953620910645, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.1167960166931, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.9855990409851, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.8329563140869, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.1136012077332, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.9009585380554, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.4697608947754, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 591.7167973518372, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.9011149406433, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.1464066505432, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.9596853256226, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.6259198188782, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.7163186073303, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 938.8788795471191, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 926.3723182678223, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 798.2215976715088, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.4881620407104, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 937.4750423431396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 910.9724760055542, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 798.9476799964905, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 824.2094421386719, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.301760673523, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.9924745559692, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 803.9201641082764, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.0571212768555, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 937.4425601959229, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 917.7323198318481, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 807.1347188949585, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.3014450073242, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.6163158416748, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.038722038269, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 807.0151948928833, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.3988752365112, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.3508749008179, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 909.0921545028687, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 812.5785684585571, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 828.8521575927734, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 935.1947164535522, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 937.8656005859375, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 807.895359992981, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.254716873169, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 934.7740745544434, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 913.4953594207764, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 808.8092851638794, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 820.6372833251953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1162.8575944900513, 
"config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1116.4334392547607, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1167.4163103103638, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1125.512957572937, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1170.0452756881714, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1124.5208024978638, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.3548774719238, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1126.8772792816162, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.5449638366699, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.9435224533081, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.5649628639221, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.4172763824463, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.9807939529419, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.181441783905, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.1183996200562, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.6353578567505, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.673436164856, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.9851198196411, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.6353588104248, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.5921626091003, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.7888011932373, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.161762714386, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.8716778755188, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.1267204284668, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.9352049827576, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.2806377410889, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.553759098053, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.0364799499512, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.9164791107178, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.9308862686157, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.1262383460999, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.6832041740417, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.9193496704102, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.7996706962585, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 875.9024000167847, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.686719417572, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 879.5030403137207, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.5804800987244, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 875.535831451416, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.1651167869568, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 768.022563457489, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.7870440483093, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 701.6942381858826, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.4657578468323, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 773.1475162506104, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.4145665168762, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.575680732727, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.0785593986511, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 774.6984004974365, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.0764765739441, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.1355142593384, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 554.4337606430054, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.0345640182495, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.5566358566284, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.136157989502, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.943356513977, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2508.334894180298, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.6436805725098, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2514.455032348633, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.0075182914734, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2523.786735534668, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.0100803375244, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2527.8945636749268, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.1134457588196, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1932.8961658477783, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1341.5974426269531, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1942.062873840332, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1347.3326444625854, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1935.0201511383057, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1346.166877746582, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1949.9556732177734, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1365.85120677948, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1933.2398414611816, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1348.529920578003, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1949.1075229644775, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1369.346890449524, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1938.2436752319336, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1351.6355180740356, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1949.699535369873, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1366.9753646850586, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1873.7366390228271, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1179.4961547851562, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1870.349416732788, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1183.728632926941, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1871.7929649353027, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1183.046236038208, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1880.8094501495361, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1183.2462406158447, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7475.730094909668, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
969.1713571548462, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7479.1412353515625, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 959.0225601196289, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7483.934288024902, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 964.6747159957886, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7496.789436340332, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 64, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 971.7057609558105, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.6713662147522, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.8851232528687, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.6027212142944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.5193600654602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.4046368598938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.1734399795532, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.0147247314453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.209282875061, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.1667170524597, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.7363204956055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.9700798988342, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.7659244537354, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 706.0740852355957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.6647968292236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.0038404464722, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.0582399368286, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.7942471504211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.1094398498535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.713595867157, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.670560836792, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.5830335617065, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.1243262290955, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.619038105011, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.9561624526978, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.3785552978516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.0180811882019, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.2615966796875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.8390383720398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.2619218826294, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.9067144393921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.0630431175232, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.2393565177917, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.4468803405762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.7148795127869, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.1697587966919, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.9476790428162, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.2892804145813, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.7883162498474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.1844792366028, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.7217602729797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.924159526825, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.1017599105835, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.7158427238464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.360643863678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 684.3721628189087, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.601279258728, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.3966360092163, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.652322769165, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.9367957115173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.0846381187439, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.2703990936279, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.9737615585327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 684.6873593330383, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.2043290138245, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.900803565979, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.6604833602905, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.9254412651062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} 
-{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.320484161377, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.006402015686, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 637.566237449646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.8217625617981, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.6489610671997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.5377588272095, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.4278383255005, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.0580821037292, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 720.7521629333496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1009.0684795379639, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1018.0921697616576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.5444717407227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 795.1929616928101, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 
32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1114.061918258667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1154.6329593658447, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.7137603759766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 720.7259202003479, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1016.3032007217407, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1034.8028755187988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.8511934280396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 779.7164750099182, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1117.9559993743896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1153.8620805740356, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.6462388038635, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 707.2187232971191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 1001.5480089187623, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1029.5339107513428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 815.8494400978088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 783.7220740318298, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1108.8283157348633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.9406394958496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.6627163887024, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.2159967422485, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1005.9084796905518, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1040.8699131011963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 814.3107175827026, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 794.0518426895142, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1116.924638748169, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1147.9395151138306, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1267156600952, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.82159948349, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.1087980270386, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.0385665893555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.9025611877441, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.1406364440918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.3328008651733, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.7372808456421, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.5174479484558, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.080801486969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.1790399551392, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.952962398529, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.1627202033997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.3729600906372, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.1169624328613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.6204733848572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.0316743850708, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.03040599823, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.0777683258057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.1158423423767, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.1169624328613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.734076499939, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.0569581985474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.7198433876038, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.67568063735956, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.9483199119568, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.3017654418945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.7483205795288, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.5814366340637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8699178695679, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.1345596313477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.4087963104248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.3142371177673, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.8596787452698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.5419187545776, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.4558439254761, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.9710369110107, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.161759853363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.0516781806946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.2230386734009, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.5481648445129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.3644776344299, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.406081199646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.2358388900757, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.8279943466187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.7825598716736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.4593605995178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.5555186271667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.3673620223999, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.1907229423523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, 
"num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.0174431800842, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 668.470561504364, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.4100794792175, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.6692814826965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.7447986602783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 663.4708762168884, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.8035154342651, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.7553558349609, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.5339183807373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.3369617462158, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.0831990242004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.6193590164185, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.316478729248, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.8935956954956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 668.8748788833618, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 852.6583957672119, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.2356820106506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.2271971702576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.1395163536072, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 852.2142362594604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.526713848114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.6163196563721, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.042558670044, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 850.1929664611816, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.3739161491394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.8544034957886, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.5081577301025, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 850.6859254837036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.7687983512878, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.903196811676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.1617588996887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.0119996070862, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.4231996536255, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.7622404098511, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.6711988449097, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.9763188362122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.2238402366638, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.5916819572449, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 
5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.0360078811646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.9438409805298, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.898886680603, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.6342406272888, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.7265605926514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.2513599395752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.3668761253357, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.036322593689, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.0531187057495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.967200756073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.4104042053223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.6084780693054, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.4782409667969, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.3795185089111, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.6808032989502, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.0079989433289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.4396843910217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.9755210876465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.0873599052429, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.6600017547607, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.4667267799377, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.7835211753845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.511522769928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.9318370819092, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.8947186470032, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 619.505922794342, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.5931210517883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.5510406494141, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.2048034667969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.8928046226501, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.3248019218445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.7961616516113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.4975981712341, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.5804858207703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.3884830474854, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.4143953323364, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.9025626182556, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.9694428443909, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.1732788085938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.5456023216248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.04816198349, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.990716457367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.1780805587769, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.9928021430969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.8092789649963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.7881650924683, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.4100842475891, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.0059237480164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1144.3814420700073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.7180852890015, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1148.235206604004, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.495837688446, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1151.7169618606567, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 707.9108786582947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1154.2894315719604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.6063995361328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.2542419433594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.8588809967041, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.8124809265137, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.4609594345093, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.8584027290344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.3590393066406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.7190327644348, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 558.6623978614807, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.488000869751, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.9628767967224, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.197274684906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.1811203956604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.4787158966064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.7526388168335, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.5280017852783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.4454426765442, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.6332874298096, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.001437664032, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.0849566459656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.4868807792664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.8779172897339, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.4171242713928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.8990454673767, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.05215883255, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.9527988433838, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.4622392654419, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.4916753768921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.3515167236328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.8542385101318, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.0782413482666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.0983963012695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.3134436607361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.5363283157348, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.1740880012512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.1343951225281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2897567749023, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.4091172218323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.1241555213928, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.6756825447083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.6694359779358, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.52176332473755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.4608058929443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.0254459381104, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.8531193733215, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.7758402824402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 526.1686396598816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.019202709198, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.7179160118103, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.6121602058411, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.0048007965088, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.6948852539062, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.2860770225525, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.8935966491699, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.7124810218811, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.154399394989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.7255954742432, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.811683177948, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.8049640655518, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.7662420272827, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.0038361549377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.4740796089172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.032799243927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.7988796234131, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.2593545913696, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.94864320755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.9268846511841, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.0091166496277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.4808011054993, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.3547253608704, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.0809607505798, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.5339198112488, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, 
"num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.300802230835, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.301281452179, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.756317615509, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.2585587501526, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 668.8273596763611, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.1979155540466, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.6921591758728, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.1489591598511, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 694.8887991905212, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.9512014389038, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.1457605361938, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.2275233268738, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.5281620025635, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.7999978065491, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.8827204704285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.3004789352417, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 697.3012828826904, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.7313613891602, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.3812766075134, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.387686252594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 681.3596749305725, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.005443572998, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.109920501709, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.8323173522949, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.567684173584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.312162399292, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 854.4390487670898, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 871.0507202148438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.6833591461182, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 786.16783618927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 797.6203179359436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 650.427520275116, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 852.6073598861694, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 866.4502382278442, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.580641746521, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.9768013954163, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 791.4603209495544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.9311981201172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 
2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 853.8056135177612, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 868.8295888900757, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.5865564346313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.623366355896, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 794.4684815406799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.7107300758362, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 863.1739234924316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 875.4688024520874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.9454379081726, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 792.3839998245239, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.8836889266968, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.4883232116699, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.893753528595, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.0329637527465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.5475211143494, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.2343997955322, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.5740795135498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1735982894897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.0507230758667, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.6592001914978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.63615894317627, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.43184661865234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.9995174407959, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.7878384590149, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.33216381073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 511.2516784667969, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.6526412963867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.9236760139465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.9419240951538, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.1302423477173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.0460858345032, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.3809537887573, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.6100778579712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.390073299408, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.6424036026001, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.71856021881104, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.370080947876, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.663996219635, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.4319982528687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.4270358085632, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.1572771072388, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.6132822036743, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.001763343811, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.0502376556396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.2239990234375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.7252779006958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.8761615753174, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.212480545044, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.2411203384399, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.6987233161926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.2334442138672, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, 
"num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.9057574272156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.7734408378601, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.7116866111755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.1684803962708, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.4588813781738, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.2193641662598, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.5753588676453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.8507256507874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.5831990242004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.91583776474, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.0723218917847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.6406421661377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.1825585365295, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.177761554718, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.8227186203003, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.2686395645142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.3761596679688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.694239616394, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.731207370758, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.8703980445862, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.6918387413025, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.6753611564636, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.781277179718, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.4887948036194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.0614433288574, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.7217597961426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.9131212234497, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.784637928009, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.2627205848694, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.5252766609192, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.6939206123352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.7620787620544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.3003196716309, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 774.2734432220459, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 682.3854374885559, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.4878416061401, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.5934376716614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 767.8214430809021, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, 
"num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.0168042182922, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.9539203643799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.4361577033997, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.73648166656494, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.8363256454468, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.4070334434509, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1123223304749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.7001643180847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.1643223762512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.7086415290833, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.0047979354858, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.4697570800781, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6187181472778, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.414234161377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.5281586647034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.5145573616028, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.5441589355469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.1076803207397, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.5953569412231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.3011207580566, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.781596660614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.1494436264038, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.1592030525208, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.0659184455872, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.4612803459167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 535.9817600250244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.356960773468, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.6849637031555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.3753528594971, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.2321605682373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.7380752563477, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.740475654602, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8348798751831, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.3731231689453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7470369338989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.2252793312073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.5422420501709, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.1472010612488, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} 
-{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.4108815193176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.8107171058655, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.8635206222534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.1417646408081, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.7251200675964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.0907163619995, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.9886436462402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.8934426307678, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.5759978294373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.6726403236389, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.4998464584351, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.4041676521301, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1076.6366386413574, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.8084797859192, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1079.6487951278687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.598714351654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1082.57408618927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.8479986190796, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1088.6448001861572, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.7835149765015, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.5625591278076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.8921551704407, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.575840473175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.6927995681763, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.4260811805725, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 511.15808248519903, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.065279006958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.5601649284363, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.2265601158142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.6760034561157, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.738558769226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.4280014038086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.5155172348022, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.4579191207886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.2415995597839, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.5931262969971, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.8719964027405, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.5687975883484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.923360824585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.0590362548828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.4257607460022, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.8180747032166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.4412860870361, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.9108805656433, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.6484794616699, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.3580794334412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.2320036888123, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.8339157104492, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.9756836891174, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.5992031097412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.0547204017639, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.9952020645142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.4049587249756, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.0150356292725, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.4175987243652, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.8760032653809, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.882246017456, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.6726403236389, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.9423985481262, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.2865600585938, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.254554271698, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.7393655776978, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.3375973701477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 659.9926424026489, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.2015962600708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.0145578384399, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.8340797424316, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.0331211090088, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.14319896698, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.2731223106384, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.5272002220154, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.6798377037048, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.8075256347656, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.1019263267517, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.7054409980774, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.764479637146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.3817558288574, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.2521662712097, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.5632004737854, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 663.4691143035889, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.9612817764282, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.2305612564087, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.7356758117676, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.3982396125793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.8871955871582, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 749.4414401054382, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.564001083374, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.2438387870789, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 774.2369604110718, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, 
"num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.3542394638062, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.6750478744507, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 742.8524780273438, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.4099254608154, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.1587224006653, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 771.065924167633, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.6823945045471, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 650.2564787864685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.3151965141296, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.4660768508911, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 759.3793654441833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 779.9107146263123, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 769.486403465271, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.2800006866455, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.218403339386, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.6131181716919, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 758.4811210632324, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 779.8339152336121, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.0292820930481, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 996.4808034896851, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1018.601279258728, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 992.7102375030518, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1018.5784053802491, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1005.8182382583618, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1023.4287977218628, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1006.7497634887694, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1023.1715202331544, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.5576019287109, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.6020832061768, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.6030349731445, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.2105631828308, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.1219258308411, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.1479992866516, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.8887987136841, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.1942348480225, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.7524819374084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.6739249229431, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.3076763153076, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} 
-{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.4382433891296, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.2767949104309, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9227228164673, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.2659244537354, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.2568025588989, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.70383644104, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.4332799911499, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.380156993866, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.9524827003479, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.2622423171997, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7368021011353, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.5859236717224, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.0787234306335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.0035157203674, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.995840549469, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.1692771911621, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.7068791389465, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.1044821739197, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.5995163917542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.7801637649536, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.5883226394653, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 681.4103960990906, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.3209552764893, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.2929592132568, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.0465593338013, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 677.4545550346375, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.3747138977051, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.3230395317078, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.5823993682861, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 680.5529594421387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.3535962104797, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.3708744049072, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.6663956642151, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 680.5982398986816, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.0641598701477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.6276807785034, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.7433590888977, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6271982192993, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.0913624763489, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.9884777069092, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8231983184814, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.1590437889099, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.7718362808228, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.0752058029175, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.4724817276001, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.4711985588074, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.7331275939941, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.2932796478271, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.2630400657654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.1612801551819, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.3769550323486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.9544053077698, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.6295971870422, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.9455981254578, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2756862640381, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.4188771247864, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.3984022140503, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.3758397102356, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.0684819221497, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.5692868232727, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.4145693778992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.6209635734558, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.2206382751465, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
606.5919995307922, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.1087942123413, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.7662410736084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.675359249115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.3584036827087, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.3766374588013, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.719202041626, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.6025557518005, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.5700764656067, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.8479995727539, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.3670401573181, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.6673564910889, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.5814433097839, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.0843200683594, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 942.0443248748779, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 948.3080005645752, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.94225025177, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 830.9920024871826, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.7846536636353, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.841588973999, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.9993591308594, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 828.1028842926025, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.4806480407715, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 955.6609582901001, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 824.8297595977783, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 834.2294263839722, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 938.6097621917725, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.8777656555176, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.3409585952759, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.2539157867432, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 943.7436819076538, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 953.3852815628052, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.9041595458984, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 834.291672706604, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 938.3051252365112, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 935.8153629302979, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 820.9539127349854, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 839.5467138290405, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 945.5777549743652, 
"config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 958.6444711685181, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 828.9371252059937, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 836.2817716598511, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 940.1252841949463, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.4692897796631, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 820.5091238021851, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 827.9468774795532, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1188.7638425827026, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1139.0116786956787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1192.6312065124512, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1144.9139213562012, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1196.6630458831787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1148.8268852233887, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1199.1324853897095, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1151.2102365493774, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.3561611175537, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.2972812652588, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.8870348930359, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.4310455322266, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.9700808525085, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.2256026268005, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.1779141426086, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.4120001792908, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.2089548110962, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.3401656150818, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.3679986000061, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.8452777862549, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.056960105896, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 663.0862379074097, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.4846396446228, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.0979180335999, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.2030415534973, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.6873650550842, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.217921257019, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.5999975204468, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.6790399551392, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.2347226142883, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
640.4379177093506, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.4911932945251, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 893.0113649368286, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.9203195571899, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 897.3352003097534, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.5142402648926, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 903.1423902511597, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.7889585494995, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 904.4144010543823, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 769.8015999794006, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 779.8886394500732, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.925440788269, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.5025572776794, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.8607997894287, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 790.2179217338562, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.1444764137268, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.8600015640259, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.758243560791, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 794.1782355308533, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.2632012367249, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.0107216835022, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.5444741249084, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 796.6763210296631, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.3857679367065, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 716.9395160675049, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.1494383811951, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2539.4411087036133, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.7367973327637, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2552.7148723602295, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.6376004219055, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2568.1479930877686, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.1838355064392, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2573.318395614624, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.6927938461304, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1952.103033065796, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1359.0283298492432, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1958.6380863189697, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1372.5860738754272, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1959.9724960327148, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1364.3467140197754, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1967.141752243042, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1370.417766571045, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1964.930076599121, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1362.9128122329712, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1967.912302017212, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1379.5636749267578, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1965.8495903015137, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1365.1654386520386, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1970.8820724487305, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1378.2129621505737, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1868.5980892181396, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1193.9454507827759, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1866.2722873687744, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1197.3190450668335, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1872.7472019195557, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1199.8555278778076, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1877.0246410369873, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1198.0375957489014, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7500.65071105957, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 968.5006332397461, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7496.341361999512, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 979.2843151092529, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7503.418045043945, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 981.8358421325684, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7526.106185913086, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 128, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 985.64528465271, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1259.6371126174927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1229.0788793563843, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1267.1767902374268, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1338.947515487671, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1885.2336120605469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1829.1235160827637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1836.0921669006348, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1887.8436851501465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1254.809913635254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1235.1792001724243, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1267.8843212127686, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1330.5446434020996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1872.5360107421875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1859.3099308013916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1874.436321258545, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1921.0147285461426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1252.7068710327148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1234.2902421951294, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1268.9958429336548, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1333.0531215667725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1888.139820098877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1859.1505718231201, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1876.254072189331, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1926.902084350586, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1247.8775882720947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1232.8001594543457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1263.5199975967407, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1334.09423828125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1880.45503616333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1844.8167896270752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1862.0091438293457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1917.6316738128662, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1258.6678314208984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1342.611198425293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1482.4416017532349, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1720.6641674041748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1856.96928024292, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1815.0046348571777, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1933.269920349121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2183.1913566589355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1266.679196357727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1325.452470779419, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1475.4838466644287, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1707.745590209961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1844.832468032837, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1815.1012802124023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1931.3668727874756, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2174.868803024292, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1266.0057735443115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1327.5030374526978, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1480.5196857452393, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1712.7619075775146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1853.2225608825684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1816.4598369598389, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1924.2225646972656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2175.711679458618, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1263.549599647522, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1326.150245666504, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1473.4009742736816, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1709.3239879608154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1847.5390338897705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1811.7511940002441, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1914.3512153625488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2152.9620838165283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1762.9961681365967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1973.5915184020996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2806.9182205200195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2885.530414581299, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2226.8295860290527, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2266.8059253692627, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3211.6020488739014, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3295.11137008667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1795.231056213379, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1960.2460670471191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2831.8447971343994, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2919.1649436950684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2226.222267150879, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2253.9065551757812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3214.8178005218506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3310.1345825195312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1809.108648300171, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1947.1249389648438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2841.1847972869873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2937.2872066497803, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2229.363498687744, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2236.3116931915283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 3219.3096256256104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3315.979804992676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1795.524492263794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1951.464958190918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2837.598237991333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2925.0332736968994, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2229.7761631011963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2237.8563117980957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3218.9145374298096, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3316.699962615967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1049.2107200622559, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1071.2718439102173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1108.029751777649, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1229.4878387451172, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1170.725440979004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1114.7571182250977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1149.4355154037476, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1250.3031921386719, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1021.492476463318, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1028.8374376296997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1086.8889474868774, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1192.2465658187866, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1185.2846431732178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1101.3500928878784, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1133.2726430892944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1232.7007913589478, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1021.5696096420288, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1029.5265626907349, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1080.1041507720947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1189.688959121704, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1181.3865518569946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1097.504644393921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1119.9983930587769, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1234.8043298721313, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1015.2828788757324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1029.4039916992188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1082.4987173080444, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1186.7692804336548, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1171.7518424987793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1090.1497650146484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1125.0076913833618, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1230.3971242904663, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1098.7814378738403, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1290.2457666397095, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1674.1484928131104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1704.6199893951416, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1201.496000289917, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1266.765432357788, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1693.1745529174805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1749.5990371704102, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1083.1881666183472, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1251.7452764511108, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1550.8132791519165, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1621.9798517227173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.3945455551147, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1232.2880029678345, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1679.4268608093262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1753.7819194793701, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1083.342866897583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1248.0281591415405, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1551.6948699951172, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1635.6678581237793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1207.8156757354736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1223.6540842056274, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 
128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1672.1687984466553, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1775.3828811645508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1082.9936027526855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1243.1292724609375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1542.17520236969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1629.67679977417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1205.3932809829712, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1218.94287109375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1667.2649574279785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1758.2412910461426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1728.3876705169678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2251.2641620635986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1474.0286493301392, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1972.3675155639648, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1675.532627105713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2198.624143600464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1452.8283262252808, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1999.5867156982422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1690.782871246338, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2193.932647705078, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1441.0691213607788, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.0332870483398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1685.5443286895752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2194.9545574188232, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1442.394404411316, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1977.8499126434326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 970.0470399856567, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 997.0609664916992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1072.8740692138672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1248.5355234146118, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1031.5075254440308, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 984.0806341171265, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1018.7641525268554, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1198.475193977356, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 950.4982328414917, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 979.9567937850952, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1019.9463891983031, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1141.3403129577637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1027.8775930404663, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 981.6134405136108, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1007.3108768463135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1205.0265645980835, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 951.2828826904297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 980.0927972793579, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.1099262237549, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1140.5527973175049, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1023.2049512863159, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 979.2481517791748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1009.7663974761963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1197.27135181427, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 941.0833597183228, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 970.2332782745361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1016.3004875183105, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1138.6312007904053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1021.0259103775023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 974.6134424209595, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1001.2521648406982, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1199.042239189148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1044.4852781295776, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1448.9542436599731, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1472.3787260055542, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1119.3969535827637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1284.0489721298218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 
512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1319.4139194488525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1030.494885444641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1308.4395265579224, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1311.0385608673096, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1094.4731187820435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1207.2350454330444, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1228.1020879745483, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1037.1731233596802, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1300.8471965789795, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1303.5825490951538, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1095.0407981872559, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1202.0291137695312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1237.1227169036865, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1028.5054445266724, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1298.2577562332153, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1289.831042289734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1085.804796218872, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1204.1727924346924, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1224.199833869934, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2830.60001373291, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1908.4585571289062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2821.8806552886963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1704.8300552368164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2823.439989089966, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1700.1129627227783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2822.421417236328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1693.81760597229, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1055.8260822296143, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1122.2932720184326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.8988733291626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 994.9108743667603, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.9841585159302, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1003.8108825683594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1029.577922821045, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1030.7419157028198, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1022.4097585678101, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 959.6831941604614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 966.933913230896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, 
"num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.3617658615112, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.6419191360474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1050.8609580993652, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1015.1004791259766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.5182361602783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 977.7567958831787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 955.3683137893677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1025.8622407913208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1032.6502466201782, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1003.0964756011962, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 964.716968536377, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 967.7892780303955, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 955.720009803772, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1538.6998558044434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1461.3667249679565, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1401.9203281402588, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1329.8638439178467, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1414.4491243362427, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1319.8528003692627, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1392.3977613449097, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1304.7816038131714, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 944.4953536987305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 921.1899089813232, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 959.8500728607178, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1027.9425525665283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1248.3094453811646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1144.661283493042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1161.59423828125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1230.2307224273682, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.8316783905029, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 934.6414566040039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 982.3657655715942, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1043.7963247299194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1224.8020887374878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1137.7555227279663, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1173.082389831543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1222.1680116653442, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 953.5150289535522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 
2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 942.4519872665405, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 981.3996696472168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1050.058879852295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1221.3108777999878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1138.8548803329468, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1172.755208015442, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1220.6654262542725, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.6423978805542, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.8123188018799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 983.2894420623779, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1055.212163925171, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1218.5444736480713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1145.8080053329468, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1168.4464025497437, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1225.6032037734985, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1073.4163188934326, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1026.2518405914307, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.492483139038, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1659.54110622406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1236.7595148086548, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.9014434814453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1295.489592552185, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1745.9214115142822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1094.9641704559326, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1032.546877861023, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 1164.395203590393, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1676.9783973693848, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1230.4492855072021, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1154.3604850769043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1276.6979217529297, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1756.736183166504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1098.8934421539307, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1039.5990371704102, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1177.2708749771118, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1674.8657703399658, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1216.3171243667603, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1154.558401107788, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1281.4539241790771, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1770.5393695831299, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1098.1537628173828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1038.7574291229248, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.7703981399536, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1675.3443336486816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1240.3358364105225, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1163.2436847686768, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1287.349443435669, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1772.5182437896729, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1483.6267232894897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2058.093433380127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2138.203344345093, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1488.0023956298828, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1934.7883033752441, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2011.0975837707522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1501.5256023406982, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2074.6416091918945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2159.3063926696777, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1523.474555015564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1944.3254375457764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2013.9351940155027, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1496.3699197769165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2065.676803588867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2159.945125579834, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1523.845772743225, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 1939.9772930145264, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2017.0648097991943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1509.3516778945923, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2076.5209579467773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2157.4324703216553, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1532.1574449539185, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1928.3809661865234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2026.8329524993899, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.9905681610107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 782.8438425064087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 790.8652782440186, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 854.9108839035034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 912.6281642913818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.8228783607483, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 870.9828805923462, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 918.4670352935791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 843.9142417907715, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 785.466878414154, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 791.5721607208252, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 854.8796939849854, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 908.6244821548462, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 840.716962814331, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 859.8980808258057, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.2230348587036, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 843.012638092041, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.0084824562073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 800.4503989219666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.0716829299927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 910.9155225753784, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 830.6192016601562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 853.8899230957031, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.5508832931519, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 838.1491088867188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.7686381340027, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 788.2316780090332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 857.2512054443359, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 916.290397644043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.8497676849365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 861.4774417877197, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.3064050674438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1032.7127981185913, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 904.6132898330688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1161.0614252090454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1191.8912029266357, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1032.157917022705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 903.1121587753296, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1200.6313610076904, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1231.362886428833, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1025.4160022735596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 912.3614358901978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1153.566074371338, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1203.8488101959229, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1051.1552000045776, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 904.2190456390381, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.6700716018677, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1228.632001876831, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.8041677474976, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 906.7972755432129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1155.9976053237915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1208.2734441757202, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1050.1409721374512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 908.6545562744141, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.9870443344116, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1229.4998359680176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.619517326355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 907.6220798492432, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.5985660552979, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1203.9919996261597, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1051.995997428894, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 899.6364688873291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1172.9750490188599, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1224.14559841156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1271.17600440979, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1871.4967823028564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1560.0763273239136, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1430.594882965088, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1283.8611221313477, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1842.8488159179688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1568.6601543426514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1429.5713520050049, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1279.6668720245361, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1839.9696063995361, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1567.2684717178345, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1429.3268775939941, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1280.3593587875366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1837.193603515625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1569.7999954223633, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1424.4983959197998, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 766.4009618759155, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 737.6667165756226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 884.2187213897705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 877.4310350418091, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.7758502960205, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 748.5017585754395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 858.6201667785645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 867.6632070541382, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.961916923523, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.5412788391113, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 884.1987133026123, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 871.1969566345215, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 884.7262334823608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.1065592765808, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 873.3092784881592, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 867.4163246154785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.946400642395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.021915435791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 888.6913728713989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 874.2224073410034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 885.1534366607666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.3604822158813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 877.9908752441406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 866.2334442138672, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.2582406997681, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.397442817688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.8137636184692, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.7820701599121, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 881.8862390518188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 749.9992036819458, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 868.6065578460693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 864.4028854370117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 920.4859161376953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1009.4046401977539, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 974.5896053314209, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 964.6912050247192, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 895.863676071167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 994.3841505050659, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 989.8822498321533, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.9955263137817, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 900.0070285797119, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 996.6476678848267, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 991.2620830535889, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 953.3940839767456, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 897.6777648925781, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1000.6913566589355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 990.504322052002, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 954.9004793167114, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2263.038558959961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1323.576636314392, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2301.4603233337402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1264.331521987915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 
256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2302.694854736328, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1267.673602104187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2307.870569229126, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1258.8591957092285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 787.6683187484741, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 817.1161603927612, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 821.6371250152588, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 981.700005531311, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 736.1006379127502, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.3566384315491, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.0086398124695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 817.7670288085938, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
822.150707244873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1032.6063966751099, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 761.4759993553162, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.3863978385925, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.251359462738, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 818.1078481674194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.7395105361938, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1034.12832736969, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 761.8644785881042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.936324596405, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 788.0902433395386, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 815.7070446014404, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 820.5142450332642, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1038.333592414856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 759.4526362419128, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 761.155993938446, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1339.7724771499634, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 959.555835723877, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1346.8232107162476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 946.7956829071045, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1346.158561706543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 943.0287933349609, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1353.8382387161255, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 941.4694404602051, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 845.9726428985596, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 775.5967998504639, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 841.9164848327637, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1079.2651176452637, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.840163230896, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 870.3110456466675, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.1449527740479, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1197.0763111114502, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.4366397857666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.570240020752, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 856.6934299468994, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1082.4855852127075, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.4817638397217, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 880.2841663360596, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 948.8760042190552, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1192.2329568862915, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 870.0771236419678, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 813.8033580780029, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 858.876805305481, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1093.7046384811401, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 942.4694442749023, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 882.0011186599731, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 945.2363300323486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1203.0700874328613, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 869.0003156661987, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.8844747543335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 861.8700838088989, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1091.2603187561035, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.47057056427, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 878.6203193664551, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 944.4849634170532, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1203.243522644043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1033.3436679840088, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1207.5406312942505, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1173.7708759307861, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1192.347526550293, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1320.9932851791382, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1330.1750421524048, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1050.3980731964111, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1230.4545593261719, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, 
"num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1167.5486421585083, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1196.5935945510864, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1326.1436891555786, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1356.0465621948242, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1045.1977586746216, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1235.4233598709106, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1178.8600063323975, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1208.9457607269287, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1353.184962272644, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1351.0628843307495, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1053.8259315490723, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1235.1115226745605, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1184.0790367126465, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1202.9614400863647, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1341.5307140350342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1351.109766960144, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1668.6043167114258, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1701.7046356201172, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1702.029619216919, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1759.364309310913, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1714.9470329284668, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1757.2215843200684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1700.3249549865723, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1744.2187118530273, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.0782399177551, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.1671957969666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 817.7572774887085, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 794.7724795341492, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.5425634384155, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.604642868042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 896.964168548584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 887.2455978393555, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.5985608100891, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.5459160804749, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.7412910461426, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 792.7508807182312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.1131205558777, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.0655989646912, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} 
-{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 887.9617643356323, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 883.8465738296509, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.147358417511, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 682.3033547401428, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 802.4652886390686, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 786.8052816390991, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.2721586227417, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 701.9630408287048, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 890.749921798706, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 874.4563150405884, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.8739199638367, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 682.4007964134216, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.9575939178467, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 788.403844833374, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.8159928321838, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.9315156936646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 879.440803527832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 882.3428821563721, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1108.8305568695068, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.3284816741943, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1241.6048002243042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 961.4681720733643, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1107.2003173828125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 881.5844631195068, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1247.4438428878784, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 955.5788803100586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1107.2967958450317, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 875.1123237609863, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1258.9489603042603, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.6195116043091, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1114.2772769927979, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 875.8384037017822, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1256.9084692001343, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.7646503448486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.7620806694031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.1451177597046, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.1793661117554, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.5108728408813, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 689.3284797668457, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.8601622581482, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 650.4443168640137, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.124801158905, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.8396754264832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 650.0739216804504, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.7012825012207, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.0360040664673, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.6044821739197, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.7734422683716, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.2761583328247, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.8214454650879, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.6916809082031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.6492805480957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.2700819969177, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.9756808280945, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.0694398880005, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.4041619300842, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.0518345832825, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 668.7038397789001, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 864.3414306640625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 843.0956792831421, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 876.8566417694092, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 850.1678419113159, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.2503900527954, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 848.4531307220459, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 885.897912979126, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 861.7019271850586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 847.0667171478271, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 706.1308836936951, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.3103971481323, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.1363224983215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 838.4364652633667, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 716.8809580802917, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.9019193649292, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 732.8057599067688, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 989.3824005126953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1066.811842918396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 906.6174411773682, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 899.2847967147827, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 974.1414451599121, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1033.1201601028442, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 884.8302412033081, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 883.7942409515381, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 989.6883153915405, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1063.9334392547607, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 912.336163520813, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 893.0531215667725, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 980.0006341934204, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1033.5857677459717, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 890.5939102172852, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 878.7131214141846, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 996.466236114502, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1067.798252105713, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 921.65696144104, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 902.8363132476807, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 982.524471282959, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1036.8795156478882, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 897.1425676345825, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 882.0582437515259, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1003.8440036773682, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1072.005763053894, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.3512048721313, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 904.8819208145142, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 988.6751985549927, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1035.290560722351, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 903.3412790298462, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 886.4716863632202, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1329.116153717041, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1283.4150457382202, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1339.9849557876587, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1289.2715072631836, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1342.231035232544, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1291.8219137191772, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1349.4299173355103, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1297.330241203308, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.3017587661743, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.5856022834778, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 682.9895949363708, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.56720495224, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.6203184127808, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.1225638389587, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.2948794364929, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.2297639846802, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.5073585510254, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 723.5872006416321, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.2993545532227, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.3057541847229, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.454562664032, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 749.0086364746094, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.6479978561401, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.7281637191772, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 723.5769605636597, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.1873574256897, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.3672018051147, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.7321577072144, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.7396841049194, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 735.5985593795776, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.3025612831116, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.0387234687805, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 969.3411254882812, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
859.346079826355, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 961.6574430465698, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 853.6302471160889, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 967.3595142364502, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 857.2313642501831, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 988.6204767227173, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 868.9715194702148, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 850.5121660232544, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 697.481279373169, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 774.8774480819702, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.3443264961243, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 858.4241580963135, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.9556841850281, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 781.6708827018738, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.3571176528931, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.6891202926636, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.0371189117432, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 784.1585612297058, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.6140828132629, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 874.3204832077026, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 721.0715198516846, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 796.3966393470764, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.4879965782166, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2717.9745769500732, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.3230409622192, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2699.128303527832, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.7743978500366, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2709.370880126953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.9462385177612, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2731.365451812744, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.8323197364807, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2076.4281463623047, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1457.1595239639282, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2097.941131591797, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1478.0262327194214, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2083.8921642303467, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1455.8040046691895, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2107.654552459717, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1482.8478288650513, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2091.898708343506, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1463.9102411270142, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2113.7078285217285, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1491.2033605575562, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2095.1006412506104, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1478.0684804916382, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2118.473119735718, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1499.1064023971558, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1905.901107788086, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1274.986081123352, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1906.136178970337, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1281.6697692871094, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1920.943193435669, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1285.9129619598389, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1945.6445026397705, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1290.278401374817, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7693.116188049316, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1036.5203142166138, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7675.943145751953, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1045.0107145309448, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7711.6047286987305, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1043.6899137496948, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7772.8369140625, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 512, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1056.410574913025, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3398.1664276123047, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 
32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3375.6281661987305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3467.9852867126465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3659.65389251709, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4979.245738983154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4937.603549957275, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4985.6013107299805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5119.582862854004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3359.736328125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3379.3026161193848, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3461.6123008728027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3615.9483337402344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5002.736167907715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 5040.7399559021, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5092.132034301758, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5243.2403564453125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3316.3067054748535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3369.159393310547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3449.480667114258, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3598.1529426574707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5027.1360206604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5043.435821533203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5120.320644378662, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5268.351173400879, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3298.319206237793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3358.3537673950195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3430.9513664245605, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3578.1862449645996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5025.854225158691, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5059.779815673828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5117.811489105225, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5271.095542907715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3483.228645324707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3669.793472290039, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4052.63614654541, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4730.128307342529, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4969.440116882324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4904.541282653809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5213.0596923828125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5982.516326904297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3422.6590156555176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3600.3073501586914, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3924.7011375427246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4625.71403503418, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4952.659015655518, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4925.298900604248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5185.362358093262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5933.870868682861, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3395.7278442382812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3580.5932998657227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3956.465129852295, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 4595.521926879883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4958.760833740234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4938.946552276611, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5199.3218994140625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5954.500961303711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3378.9064407348633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3558.86287689209, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3929.863815307617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4564.4440269470215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4956.67293548584, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4910.14892578125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5218.565444946289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5984.881420135498, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} 
-{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4829.424667358398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5433.060321807861, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7806.777114868164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8017.152519226075, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6049.528961181641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6209.582901000977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8967.93815612793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9176.490364074707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4776.638412475586, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5285.107326507568, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7859.498329162598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8068.514213562011, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6004.361743927002, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6197.78959274292, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8971.304893493652, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9194.435691833496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4811.991539001465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5260.808944702148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7882.068824768066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8095.280799865723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6053.629627227783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6205.10046005249, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8991.76399230957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9224.360389709473, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4807.774562835693, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 5241.690616607666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7886.771087646484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8113.81248474121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6043.034381866455, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6202.785606384277, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9000.541343688965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9221.157264709473, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2861.185464859009, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2899.6649742126465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3039.9782371520996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3371.622085571289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3189.772663116455, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3074.2504024505615, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} 
-{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3152.0294284820557, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3387.3132133483887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2674.692335128784, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2741.5833473205566, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2873.1006240844727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3182.318878173828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3165.7116985321045, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2923.6572647094727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2999.93070602417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3241.220169067383, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2631.664161682129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2701.073589324951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2865.827522277832, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3164.26176071167, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3143.0513763427734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2881.092004776001, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2998.9297771453857, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3238.419075012207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2610.135660171509, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2679.1004943847656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2848.853931427002, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3141.5671825408936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3113.8391971588135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2874.32222366333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2992.4087810516357, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 3231.563186645508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2998.3241748809814, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3538.88126373291, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4541.369152069092, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4591.915645599365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3285.662250518799, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3416.860828399658, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4514.179973602295, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4756.4812660217285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2927.3929595947266, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3329.757614135742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4065.9684944152837, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4292.513599395752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} 
-{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3211.86767578125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3192.1806144714355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4474.955062866211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4675.568618774414, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2908.5465717315674, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3317.319164276123, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4074.192638397217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4305.4304122924805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3193.121757507324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3178.0118560791016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4489.347724914551, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4689.27282333374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2889.308786392212, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3298.177604675293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4065.5955123901367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4299.972667694092, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3173.1617736816406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3168.003349304199, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4484.960670471191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4688.771991729736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4707.722549438477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6129.013919830322, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3971.7742919921875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5325.346088409424, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4491.353759765625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6070.331707000732, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3809.7813034057617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5318.385791778564, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4495.207462310791, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6050.131034851074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3848.9816665649414, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5319.352264404297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4480.509128570557, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6053.228282928467, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3830.306911468506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5331.415042877197, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2645.3375911712646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2740.9164905548096, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2942.057590484619, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3396.282215118408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2791.8868732452393, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2684.8777770996094, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2787.807502746582, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3316.223030090332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2545.5401611328125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2587.7246475219727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2703.953561782837, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2969.3715286254883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2682.7446365356445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2565.7172775268555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
2704.6086502075195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3180.517120361328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2505.807695388794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2560.28302192688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2683.8115215301514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2972.8806495666504, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2659.173765182495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2542.5372886657715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2683.1230449676514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3173.1118488311768, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2492.860326766968, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2540.917615890503, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2672.7654552459717, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2966.6897583007812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2651.095190048218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2526.7064094543457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2670.849094390869, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3172.3654556274414, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2822.297592163086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4021.46240234375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4006.723804473877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2985.1467418670654, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3496.418914794922, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3583.0467224121094, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2690.296154022217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3275.2033615112305, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3365.4108810424805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2827.7331161499023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3144.350709915161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3245.333938598633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2664.6996688842773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3274.598560333252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3357.90225982666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2823.8379096984863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3141.2028789520264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3239.948310852051, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2656.415672302246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3273.0588912963867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3344.326515197754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2819.5796871185303, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3142.66752243042, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3245.0380897521973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7468.764305114746, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5136.006107330322, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7173.5515213012695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4534.069080352783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7207.119598388672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4524.626083374023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7209.914016723633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4528.137454986572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2775.597610473633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2904.4750022888184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3118.497905731201, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2689.8905754089355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2620.643539428711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2736.3576126098633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2605.3681564331055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2575.7755088806152, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2597.467851638794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2484.5435333251953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2382.88911819458, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2440.0817489624023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2588.324022293091, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 2577.5216102600098, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2589.388647079468, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2428.8852882385254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2383.675193786621, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2429.655990600586, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2588.593759536743, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2573.782091140747, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2580.9030532836914, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2406.85152053833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2379.979200363159, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2426.647367477417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3961.3167762756348, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3724.070415496826, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 
1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3542.637462615967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3252.7791786193848, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3569.930839538574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3279.5262145996094, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3582.5556564331055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3281.7317962646484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2374.9667358398438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2363.015537261963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2469.046573638916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2636.2950801849365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2933.1019020080566, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2786.2177658081055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2857.806558609009, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2981.432809829712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2458.396472930908, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2414.6208000183105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2538.7096214294434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2695.8969402313232, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2894.0816020965576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2809.7070503234863, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2849.927349090576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2975.6071949005127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2468.083028793335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2424.6228790283203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2539.232635498047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 2698.7817764282227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2902.682867050171, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2804.1208171844482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2862.282419204712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2978.3506965637207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2473.724822998047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2419.9643230438232, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2534.3817615509033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2698.1713676452637, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2899.4129753112793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2807.589912414551, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2855.5313682556152, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2981.5920066833496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} 
-{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2663.2795238494873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2625.1976203918457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2965.526885986328, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4074.348964691162, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2983.815870285034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2827.1772956848145, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3149.468011856079, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4402.744312286377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2756.571521759033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2653.8516807556152, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2971.174077987671, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4121.101474761963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3044.738073348999, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2848.9119720458984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3129.304962158203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4454.999847412109, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2760.1497554779053, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2649.9431800842285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2973.3580684661865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4139.254055023193, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3044.8459148406982, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2856.2953662872314, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3123.3929443359375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4458.900909423828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2767.0831966400146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2652.226400375366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2967.517442703247, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4114.990711212158, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3038.4968090057373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2857.829761505127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3135.0531005859375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4468.111057281494, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3609.7054481506348, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5193.585109710693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5422.665119171143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3643.6180877685547, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4903.718891143799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5064.913959503174, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, 
"num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3677.795524597168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5217.720584869385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5441.573429107666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3727.468032836914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4881.699199676514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5077.261753082275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3680.0318336486816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5218.181610107422, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5454.44128036499, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3750.2574729919434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4878.147830963135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5091.23104095459, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
3680.569267272949, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5216.019382476807, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5455.9124755859375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3753.722038269043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4876.273937225342, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5101.654090881348, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2073.2859230041504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1901.355218887329, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1954.589605331421, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2138.2076930999756, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2206.1382484436035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2030.9395313262942, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2087.3702430725098, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2266.5113735198975, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2087.238712310791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1904.84769821167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1958.400478363037, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2128.4636878967285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2216.427354812622, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2033.6316871643066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2086.618871688843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2273.0257987976074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2082.1815967559814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1904.2384147644043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1954.8950290679932, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2130.430564880371, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2203.6538982391357, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2035.9676837921143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2082.168016433716, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2284.886884689331, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2079.059371948242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1902.9017639160156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1953.128957748413, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2125.4708862304688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2205.1587295532227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2034.4441604614256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2079.591999053955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2274.364004135132, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 2535.150566101074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2259.431505203247, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2845.5051136016846, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2914.4007873535156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2463.1679821014404, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2234.590082168579, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2912.757921218872, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3021.9435024261475, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2569.222402572632, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2258.2121562957764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2799.4305419921875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2899.363832473755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2474.388484954834, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2201.64927482605, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2873.486557006836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2988.3344078063965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2560.309133529663, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2258.278570175171, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2794.201774597168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2906.28849029541, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2477.816162109375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2197.915687561035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2874.9288177490234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2992.1313762664795, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2554.886713027954, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2246.7395210266113, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 
64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2799.214868545532, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2908.0331134796143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2491.225748062134, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2195.72735786438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2878.4177589416504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2991.6307163238525, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3188.3969688415527, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4585.373268127441, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3781.713581085205, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3472.1510696411133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3110.698719024658, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4583.545951843262, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 3819.6694374084473, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3483.972969055176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3115.3204822540283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4617.675189971924, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3830.5409622192383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3495.5118560791016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3122.009925842285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4619.451847076416, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3831.1009216308594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3493.672504425049, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1774.6553707122803, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1732.4222660064697, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2023.433427810669, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} 
-{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2105.2148723602295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1926.328945159912, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1747.3659229278564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2028.6406517028809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2068.9443016052246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1777.6380729675293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1697.9140949249268, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2001.6564655303957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2006.9626998901367, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.6582489013672, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1727.123498916626, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2048.303689956665, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2064.1391944885254, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1765.5036926269531, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1697.0948791503906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1996.7046356201172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2014.450874328613, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1986.7118167877197, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1724.1163158416748, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2043.101444244385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2070.168466567993, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1767.9752159118652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1690.178565979004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2001.7927932739258, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2017.6481437683105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1986.7012786865234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1716.0296058654785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2056.33056640625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2071.194849014282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2280.185432434082, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2381.10463142395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2327.71183013916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2345.991849899292, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2227.5305461883545, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2355.4878520965576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2318.7060832977295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2318.8564682006836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2210.6049728393555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2354.4163131713867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2321.523332595825, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2322.78431892395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2207.3196983337402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2352.5390243530273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2321.721782684326, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2322.556962966919, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5269.579048156738, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3226.089630126953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5280.382270812988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2994.468011856079, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5306.164970397949, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 2998.841257095337, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5340.772190093994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3000.4952144622803, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1736.9003009796143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1817.867841720581, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1844.2942428588867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2204.0719985961914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1571.3019180297852, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1676.655511856079, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1669.9276638031006, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1778.340015411377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1801.4459037780762, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2299.9059009552, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, 
"num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1633.461594581604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1641.5945720672607, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1657.5297832489014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1788.0214500427246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1803.341121673584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2307.8664016723633, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1628.8083171844482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1653.2007884979248, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1656.5063953399658, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1783.9534282684326, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1796.2208080291748, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2300.9156608581543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1624.0582466125488, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1649.1795063018799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3047.9569721221924, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2316.324167251587, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2988.1812858581543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2133.7673664093018, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2990.3555488586426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2142.683343887329, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3006.4640140533447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2144.823989868164, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1968.0879878997803, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1843.1980800628662, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1965.6065464019775, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2297.1195220947266, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2031.2476921081543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1970.7147216796875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2077.2993659973145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2520.996160507202, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2059.16880607605, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1986.493787765503, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2088.6003398895264, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2350.47438621521, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2114.6878242492676, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2054.633913040161, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2168.411512374878, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2555.338888168335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2078.6731338500977, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1988.6934280395508, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2088.3039951324463, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2353.251190185547, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2109.0505695343018, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2056.8019104003906, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2167.594585418701, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2577.12495803833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2084.486885070801, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1988.7305450439453, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2086.6374492645264, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2370.1675128936768, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 2110.013608932495, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2056.52174949646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2166.485776901245, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2594.452476501465, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2260.727834701538, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2539.962863922119, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2508.5123252868652, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2491.8990516662598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2810.4096031188965, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2869.3971157073975, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2418.5827255249023, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2629.3460750579834, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2641.023349761963, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} 
-{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2531.2494373321533, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2830.636339187622, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2872.23087310791, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2423.986883163452, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2645.1630306243896, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2639.1614151000977, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2532.3561477661133, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2830.993137359619, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2881.0789012908936, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2424.3142414093018, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2679.1438388824463, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2639.4916915893555, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2547.5068759918213, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2849.4545555114746, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2885.873441696167, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3568.21439743042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3626.3113594055176, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3677.0099449157715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3907.2679710388184, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3716.8251419067383, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3905.219192504883, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3793.213596343994, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3907.347011566162, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1440.1521587371826, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1458.5476779937744, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1645.1446533203125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1683.3689403533936, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1552.6040029525757, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1485.0561618804932, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1873.8601684570312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1888.633918762207, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1471.478238105774, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1467.3521614074707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1641.1303901672363, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1640.5348873138428, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1581.9297647476196, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1483.4915208816528, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1885.3987216949463, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1892.7788829803467, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1466.513442993164, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1466.6603136062622, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1645.2782154083252, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1642.2620868682861, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1580.8089590072632, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1478.3070468902588, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1876.915683746338, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1898.0444622039795, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1468.7383937835693, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1467.1684789657593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1654.6161556243896, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1647.7076816558838, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1579.432315826416, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1474.7764825820923, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1881.3409423828125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1901.9969367980957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2239.4564723968506, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1791.3531303405762, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2586.477117538452, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2057.7115058898926, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2309.9473571777344, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1789.504976272583, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2661.491184234619, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2058.5496044158936, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2323.161449432373, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1791.2844944000244, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2663.17120552063, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2063.3236694335938, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2339.2657375335693, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1804.5136070251465, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2669.210557937622, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2065.5083179473877, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1232.027039527893, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1318.1190490722656, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1317.3673486709595, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1269.2607975006104, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1309.6385526657104, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1291.9852876663208, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1257.6239919662476, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1320.0656032562256, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1274.7187089920044, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1288.316798210144, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1316.1670446395874, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1253.4862327575684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1251.3371229171753, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1314.6345663070679, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1285.5884838104248, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1283.6652755737305, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1318.3768033981323, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1259.4539213180542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1253.532633781433, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1326.0905504226685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1289.2579221725464, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1284.3487977981567, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1318.4723281860352, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1264.8073625564575, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1751.1100769042969, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1707.0840072631836, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1777.3012828826904, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1719.8964881896973, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1778.2929801940918, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1721.4891147613525, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1786.482572555542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1726.038408279419, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1579.8164749145508, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1382.9824018478394, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1618.8468837738037, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1412.4438428878784, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1625.344476699829, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1417.297601699829, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1639.138422012329, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1414.4838380813599, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1911.3032245635986, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1967.8265571594238, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1752.2313499450684, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1803.6036777496338, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1897.3329639434814, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1926.4302253723145, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1710.7283210754395, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1766.4305400848389, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1995.8062553405762, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2069.9638271331787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1959.3830299377441, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1979.8966217041016, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1975.342903137207, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2003.9855957031252, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1900.9603214263916, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 
1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1920.981912612915, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1997.518720626831, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2086.5457725524902, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1958.3865642547607, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1982.981767654419, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.9335842132568, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2009.5001602172854, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1893.1991863250732, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1918.327522277832, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2006.7752265930174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2093.8532733917236, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1958.1025695800781, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1985.1593494415283, "config": {"BLOCK_SIZE_M": 128, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1992.9534530639648, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2025.5969619750974, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1896.0628700256348, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1918.2876777648926, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2485.310583114624, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2392.2119998931885, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2708.517904281616, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2613.7056064605713, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2739.5281505584717, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2627.535991668701, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2749.733934402466, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2627.3576068878174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1312.4516868591309, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1371.1948776245117, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1270.139832496643, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1263.5889625549316, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1305.927677154541, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.359040260315, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1343.798713684082, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1385.2571296691895, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1310.3020858764648, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1291.3950490951538, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1327.0172834396362, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1247.3668766021729, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1345.3529596328735, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1385.2844858169556, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1319.5417547225952, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1294.631519317627, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1328.6287927627563, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1260.3399991989136, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1364.982409477234, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1405.8966398239136, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1327.6868772506714, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1312.0347356796265, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1341.5497636795044, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1269.441270828247, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1848.6526489257812, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1629.4103908538818, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1871.0152053833008, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1676.5619087219238, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1874.618215560913, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1676.3841533660889, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1894.251365661621, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1689.7099018096924, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1476.5927982330322, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1110.1559972763062, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1330.5219173431396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1004.140796661377, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1501.026725769043, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1125.6663942337036, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 
1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1354.266881942749, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1019.5953702926637, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1508.3126401901245, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1133.9270496368408, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1352.785120010376, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1026.3347148895264, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1534.6483182907104, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1166.9036865234375, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1380.561923980713, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1042.5006341934204, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5522.248821258545, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1113.4160041809082, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5470.490398406982, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1125.6692743301392, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5484.120445251465, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1136.4676904678345, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5520.324821472168, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1159.1534423828125, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2378.0892753601074, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1687.1630477905273, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2392.27313041687, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1698.1086158752441, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2512.5012588500977, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1811.3804912567139, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2538.1628704071045, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1804.7070598602295, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2522.376136779785, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1819.1190338134766, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2555.8112239837646, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1805.8307266235352, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2550.4505825042725, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1844.9092769622803, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2573.529920578003, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1825.0847816467285, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2023.0862617492678, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1507.276315689087, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2053.344955444336, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1537.781286239624, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2122.2268676757812, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1551.8648052215576, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2237.440004348755, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1632.534580230713, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8713.604316711426, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1220.901608467102, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8574.843406677246, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1244.5124912261963, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8596.102027893066, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1268.5089540481567, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8652.186546325684, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1536, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1299.2396926879883, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 6719.045829772949, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6646.787338256836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6822.736511230469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7198.8043212890625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9677.865180969238, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9703.671226501465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9818.126602172852, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10076.600303649902, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6609.2461013793945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6655.421257019043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6807.887229919434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7085.511932373047, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9732.455673217773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9927.391128540039, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10030.67813873291, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10281.354789733887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6485.452919006348, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6611.485710144043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6747.314453125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6970.997009277344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9748.673858642578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9867.727699279785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9958.968276977539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10270.403823852539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6382.735500335693, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6559.672164916992, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6699.045562744141, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6900.298919677734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9748.525924682617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9864.921913146973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9995.019302368164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10280.852813720703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6825.440444946289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7223.251495361328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7947.279357910156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9303.040809631348, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9681.023712158203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9596.994743347168, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 10191.981315612793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11701.329498291016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6696.771507263184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7077.397804260254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7618.75057220459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9020.933380126953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9724.002075195312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9601.309242248535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10163.078880310059, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11606.125602722168, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6585.039100646973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6958.767623901367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7512.055206298828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, 
"num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8863.439254760742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9687.592086791992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9602.247543334961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10194.06509399414, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11706.768798828125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6529.177284240723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6899.077339172363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7474.810562133789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8827.585792541504, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9687.701606750488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9602.64144897461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10169.466819763184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11735.445899963379, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9477.139663696289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10695.657348632812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15290.210266113281, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15674.352188110352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11773.2661819458, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12233.904457092285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17582.018432617188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17966.37222290039, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9243.216171264648, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10297.956352233887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15336.405029296875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15736.24641418457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11684.675407409668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12215.528450012207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17559.83039855957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17977.873992919922, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9164.625511169434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10182.287864685059, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15369.207153320312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15773.988800048828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11736.568374633789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12148.052673339844, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17595.061569213867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 18015.470428466797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9149.334564208984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10125.5904006958, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15400.078659057617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15822.32666015625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11784.226951599121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12156.501388549805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17620.61851501465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 18044.183044433594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5675.832767486572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5698.177127838135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5975.10383605957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6618.336334228516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6171.899662017822, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 6066.244468688965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6220.200786590576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6645.095367431641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5273.587017059326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5402.191505432129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5646.9794845581055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6251.028804779053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6061.439208984375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5777.230854034424, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5890.69356918335, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6332.887668609619, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5129.495010375977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5286.825923919678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5573.205261230469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6154.095039367676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5995.698890686035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5721.798915863037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5908.659362792969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6315.314407348633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5041.260147094727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5259.460315704346, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5519.79362487793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6125.026073455811, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5976.546192169189, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5714.399166107178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5892.145481109619, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6302.975978851318, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5894.132957458496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6954.125137329102, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8886.41887664795, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8952.797317504883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6392.960987091064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6671.162910461426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8813.133926391602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9337.667121887207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5698.531799316406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6447.576484680176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7924.637184143066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 8369.136543273926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6173.743152618408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6264.206714630127, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8743.07674407959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9117.575721740723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5611.920680999756, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6356.739044189453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7934.805641174316, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8377.519340515137, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6116.063995361328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6204.092330932617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8743.186950683594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9119.889526367188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 
3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5566.061267852783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6338.58283996582, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7934.606513977051, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8394.662551879883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6071.737442016602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6201.879234313965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8753.512344360352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9142.232971191406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9188.326034545898, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12029.552192687988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7759.568176269531, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10425.077896118164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8711.600723266602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 
64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11884.332962036133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7467.669868469238, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10386.559562683105, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8649.6439743042, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11911.956939697266, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7401.120491027832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10385.019073486328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8624.097480773926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11920.186614990234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7429.005966186523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10401.572341918945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5242.619190216064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 5362.594890594482, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5777.8839683532715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6670.206451416016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5444.042701721191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5267.865428924561, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5436.658191680908, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6524.570274353027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4925.76530456543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4999.675884246826, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5297.648010253906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5717.866916656494, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5186.768817901611, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5078.205165863037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} 
-{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5308.9606285095215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6162.939872741699, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4826.736145019531, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4905.82498550415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5236.624011993408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5716.376152038574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5110.968036651611, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5017.948169708252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5274.105796813965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6174.01424407959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4792.508163452148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4855.534687042236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5203.804988861084, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5710.014667510986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5096.951522827148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4953.240985870361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5224.959354400635, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6158.796844482422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5487.086582183838, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7942.040672302246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7822.897109985352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5868.934917449951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6912.585296630859, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6989.776382446289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5174.915199279785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6361.993618011475, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6542.569770812988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5491.790885925293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6112.320308685303, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6322.597770690918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5086.477298736572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6363.957786560059, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6526.952133178711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5421.695194244385, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6107.9155349731445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6322.765789031982, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5067.203693389893, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6360.346431732178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6526.633529663086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5434.59924697876, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6110.4375648498535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6325.835647583008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 14440.595092773438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10029.375038146973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13654.854125976562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8831.327857971191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13490.739364624023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8784.872779846191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13635.31265258789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8793.098983764648, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 5438.973579406738, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5876.558570861816, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6112.789287567139, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5267.197914123535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5274.315223693848, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5352.274875640869, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5036.7919921875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4881.283855438232, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4955.075969696045, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4842.444686889648, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4561.748313903809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4639.947700500488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4948.326072692871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4874.974060058594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4946.101150512695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4721.37565612793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4536.0846519470215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4649.232139587402, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4942.004203796387, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4873.052978515625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4942.622852325439, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4680.519638061523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4532.869606018066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4647.036476135254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7762.73006439209, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7243.764915466309, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 
256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6887.067565917969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6274.78572845459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6919.398880004883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6312.515525817871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6959.618301391602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6354.217758178711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4622.491874694824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4635.30553817749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4815.625591278076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5102.1173095703125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5546.114749908447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5374.777774810791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 5499.715843200684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5727.480506896973, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4762.509899139404, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4740.871715545654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4904.640007019043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5184.524936676025, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5479.5676612854, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5418.470726013184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5554.413585662842, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5727.69588470459, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4751.7206382751465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4738.061141967773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4903.37345123291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5179.840145111084, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5485.44620513916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5408.506565093994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5529.57950592041, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5700.348815917969, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4757.808666229248, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4751.684169769287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4917.278347015381, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5181.759376525879, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5499.914436340332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5461.9610023498535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5587.750091552734, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5740.932846069336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5121.468772888184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5075.416164398193, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5739.204483032227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7846.584854125977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5739.576950073242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5431.143550872803, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6058.545303344727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8488.752326965332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5238.2073974609375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5097.761116027832, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5702.49870300293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7878.4770584106445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 5780.096168518066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5491.426048278809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6018.958568572998, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8512.580184936523, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5250.671329498291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5082.024211883545, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5694.69690322876, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7919.532051086426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5784.579372406006, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5471.473426818848, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6036.342372894287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8564.574279785156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5288.1340408325195, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 
3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5090.164642333984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5668.336143493652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7917.037124633789, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5795.811672210693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5504.110870361328, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5994.868011474609, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8560.474281311035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6933.077812194824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9977.87338256836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10447.89264678955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6998.233451843262, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9419.699249267578, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9716.440353393555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 
32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6991.3177490234375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10006.340827941895, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10453.75747680664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7009.168014526367, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9375.206527709961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9745.266418457031, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7065.209732055664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10026.421699523926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10478.891372680664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7125.404891967773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9385.543937683105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9792.298126220703, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 7063.996353149414, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10041.904258728027, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10494.761810302734, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7111.309547424316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9399.676361083984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9774.037818908691, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3976.5615844726562, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3661.437587738037, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3802.838077545166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4128.471527099609, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4233.222236633301, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3965.367965698242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4101.369171142578, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4347.764015197754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3982.5880241394043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3647.1345710754395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3763.55411529541, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4108.3514976501465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4254.422721862793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3899.9710655212402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4067.5547218322754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4355.852947235107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3977.1823692321777, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3639.569969177246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3756.5689849853516, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4102.888488769531, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4246.247692108154, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3916.4363288879395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4063.869132995605, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4354.857635498047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3966.131076812744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3652.851333618164, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3771.1635208129883, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4087.170925140381, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4235.361251831055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3896.888198852539, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4048.754539489746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4368.2402992248535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 4860.128269195557, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4339.653720855713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5431.69454574585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5583.976955413818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4665.000591278076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4324.3256187438965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5608.589458465576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5810.016174316406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4878.243026733398, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4247.850227355957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5350.406894683838, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5542.430839538574, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4630.462207794189, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4282.983207702637, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5511.516456604004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5707.33154296875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4890.0346755981445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4245.938529968262, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5352.949619293213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5560.164966583252, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4683.6944007873535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4238.619518280029, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5513.0682945251465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5727.06579208374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4875.992813110352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4254.8846435546875, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5354.137725830078, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5553.392639160156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4698.7470626831055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4241.160469055176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5504.038066864014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5713.783416748047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6160.447177886963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8896.777877807617, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7142.779083251953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6663.7260818481445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5955.458526611328, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8861.645164489746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7241.336822509766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6665.782623291016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5975.148658752441, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8893.209762573242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7256.1761474609375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6693.633346557617, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5989.556312561035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8896.155548095703, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7316.997108459473, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6686.792411804199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3411.756172180176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3311.597900390625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3872.435531616211, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4017.947502136231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3651.4743995666504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3343.945598602295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3790.4512214660645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3914.3822288513184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3369.582862854004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3199.1705799102783, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3732.6598358154297, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3764.6467208862305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3665.0406455993652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3255.735673904419, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3813.533306121826, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
3874.554023742676, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3361.0094261169434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3179.795846939087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3726.3537979125977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3765.1804542541504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3694.184799194336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3269.3628883361816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3837.036647796631, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3890.145778656006, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3347.337589263916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3169.575662612915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3718.5826110839844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3782.865791320801, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3685.0535583496094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3274.3803787231445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3838.623790740967, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3874.7361755371094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4412.995338439941, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4529.288959503174, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4403.776073455811, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4407.669315338135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4226.780014038086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4427.486553192139, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4245.452346801758, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4343.118095397949, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4215.660171508789, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4430.881462097168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4276.342086791992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4352.206382751465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4197.697582244873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4424.772644042969, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4249.654407501221, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4348.68782043457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9956.268844604492, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6132.611827850342, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9914.452590942383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5721.67423248291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9915.113525390625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5726.709575653076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9963.97087097168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5745.419521331787, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3288.0713844299316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3440.7533073425293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3533.184986114502, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4117.703990936279, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3020.559377670288, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3217.171401977539, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3135.1529598236084, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3363.2597160339355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3428.556308746338, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4233.564605712891, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3077.113914489746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3138.3896160125732, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3087.5153827667236, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3367.1657371520996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3415.7278442382812, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4304.258728027344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3058.9667224884033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3139.2878437042236, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3039.786729812622, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3359.0796661376953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3409.5737838745117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4302.924461364746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
3076.761131286621, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3132.5169563293457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5709.120826721191, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4337.77811050415, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5501.945781707764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3949.434070587158, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5529.47904586792, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3974.8748779296875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5538.884315490723, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3989.529285430908, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3579.679374694824, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3477.0164680480957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3738.2571601867676, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4172.719345092773, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3737.8482055664062, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3696.956615447998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3858.394241333008, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4523.578262329102, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3870.459041595459, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3734.9222373962402, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3910.4451179504395, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4386.444339752197, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3927.2436904907227, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3854.030227661133, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4021.6636466979985, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4648.143367767334, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3917.4765014648438, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3744.125270843506, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3912.363510131836, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4372.611408233643, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3943.5300827026367, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3885.4993438720703, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4039.8006439208984, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4611.114368438721, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3929.060935974121, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3757.2817611694336, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3933.6196517944336, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4405.962390899658, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 3971.8227005004883, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3995.2252769470215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4136.6657638549805, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4662.1452713012695, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4300.489978790283, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4640.682849884033, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4648.847255706787, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4427.051029205322, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5055.168476104736, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5191.24719619751, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4579.711971282959, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4833.220100402832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4886.607837677002, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4585.250225067139, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5087.9157066345215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5206.540508270264, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4609.048328399658, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4840.1225662231445, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4887.937641143799, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4592.612934112549, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5114.92338180542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5207.694129943848, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4592.380828857422, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4966.184329986572, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4901.796016693115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4608.4792137146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5195.35774230957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5222.2881507873535, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6292.737102508545, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6416.084156036377, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6681.443252563477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7235.70613861084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6843.65852355957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7262.9949951171875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7099.631423950195, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7125.032081604004, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2660.330228805542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2699.741430282593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 3008.1691455841064, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3090.1102352142334, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2841.2643146514893, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2739.583044052124, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3314.29386138916, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3351.932792663574, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2697.7091312408447, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2719.2323207855225, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2911.317768096924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3014.7740650177, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2900.0027179718018, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2746.8052864074707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3317.6177406311035, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3337.038097381592, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2701.7580890655518, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2711.64701461792, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2929.2761611938477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3015.688304901123, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2889.711494445801, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2723.3454418182373, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3315.72359085083, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3343.924789428711, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2689.4153594970703, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2693.5574340820312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2975.6169509887695, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3062.254867553711, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2888.156156539917, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2696.095027923584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3328.581771850586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3340.1158332824707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3976.032199859619, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3242.522602081299, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4588.773288726807, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3661.8167686462402, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4165.911712646484, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3171.815528869629, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4771.067371368408, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3660.054931640625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 4221.89245223999, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3169.11057472229, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4794.654693603516, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3656.919403076172, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4282.778377532959, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3213.0689430236816, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4823.065624237061, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3661.6787147521973, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2250.163679122925, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2389.010238647461, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2427.599687576294, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2276.1079692840576, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2301.0649585723877, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 
3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2359.819211959839, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2212.0164680480957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2294.3850994110107, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2225.3878116607666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2271.3478469848633, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2296.613130569458, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2195.966739654541, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2189.402551651001, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2299.847345352173, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2223.944625854492, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2253.1158351898193, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2303.5876655578613, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2190.855369567871, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2189.1111850738525, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2315.421733856201, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2252.940788269043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2252.782096862793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2316.4827251434326, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2211.952495574951, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3048.3747005462646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2964.126558303833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3124.6545600891113, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3031.409730911255, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3150.1737689971924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3046.3764667510986, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3191.504487991333, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3081.980972290039, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2653.1222343444824, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2369.8475074768066, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2735.572328567505, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2435.5695819854736, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2742.113780975342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2421.7654418945312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2770.2352046966553, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2436.0139179229736, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3506.628475189209, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3608.0536460876465, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3371.234073638916, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3577.9761505126953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3515.850601196289, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3516.7840003967285, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3316.903839111328, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3506.32869720459, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3858.8800048828125, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3948.9412879943848, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3792.635040283203, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3824.6449851989746, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3800.840301513672, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3813.681240081787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3674.298572540283, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
3716.061420440674, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3886.4574241638184, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3977.9373359680176, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3791.8465995788574, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3967.6192474365234, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3823.4302139282227, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3834.3567848205566, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3667.2668838500977, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3859.7997093200684, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3906.6576194763184, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4005.243988037109, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3789.136619567871, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4155.125885009766, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 3072, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3843.2198333740234, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3869.423007965088, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3676.29695892334, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4045.7315063476562, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4659.537754058838, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4484.645309448242, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5117.50452041626, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4955.887184143066, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5172.786407470703, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4974.456748962402, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5216.9169998168945, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5019.363479614258, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2496.210880279541, "config": {"BLOCK_SIZE_M": 128, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2517.8284740448, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2396.563034057617, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2446.7660903930664, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2379.775342941284, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2349.7417545318604, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2573.028335571289, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2571.6137504577637, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2496.100015640259, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2512.514228820801, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2459.3337535858154, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2386.9185638427734, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2580.7145404815674, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 2583.99582862854, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2548.3105659484863, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2515.1895904541016, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2473.508176803589, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2437.9619312286377, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2594.046697616577, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2616.5042972564697, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2580.185146331787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2524.306221008301, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2493.4907245635986, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2472.1491050720215, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3462.219524383545, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2972.778091430664, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, 
"num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3604.2043113708496, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3131.0430335998535, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3639.336452484131, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3161.7191791534424, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3681.749267578125, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3194.309787750244, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2761.0827255249023, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2083.7707138061523, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2475.4097652435303, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1823.7574291229248, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2826.931371688843, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2091.691026687622, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
2516.9739151000977, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1826.8292713165283, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2845.4255962371826, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2107.886390686035, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2531.731996536255, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1830.2620792388916, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2886.7788696289062, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2124.3949031829834, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2532.0750427246094, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1843.4707164764404, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9357.18978881836, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1935.8806419372559, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9299.665985107422, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 
3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.0145473480225, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9339.782829284668, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1995.8995532989502, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9433.947868347168, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2019.573745727539, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4189.321727752686, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3215.68660736084, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4252.872180938721, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3235.896167755127, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4701.358585357666, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3447.967052459717, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4758.719863891602, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3430.4079818725586, "config": {"BLOCK_SIZE_M": 256, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4717.36701965332, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3460.278377532959, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4778.1086349487305, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3441.537628173828, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4823.398418426514, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3521.0648155212402, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4879.204940795898, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3506.706199645996, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3868.5687828063965, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2769.857921600342, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3918.009262084961, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2919.9692916870117, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 4039.7025489807124, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2953.751850128174, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4260.711822509766, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3060.1609802246094, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15353.79753112793, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2124.056167602539, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15376.233596801758, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2176.843204498291, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15412.657318115234, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2215.5020904541016, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15426.503601074219, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 3072, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2279.019536972046, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.45039999485016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 150.69792091846466, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 149.7355192899704, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.03056073188782, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.95344066619873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.68080008029938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.060959815979, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.80224061012268, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.10928070545197, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.01664006710052, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.15584015846252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.48559999465942, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.52992033958435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.29471957683563, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.09200143814087, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.99055922031403, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.7033599615097, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.1153599023819, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.17295920848846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.90783989429474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.49520087242126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.0083191394806, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.65439975261688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.8662406206131, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.72784006595612, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.49439918994904, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
152.37823963165283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.07344043254852, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.99295926094055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.76927947998047, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.36784040927887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.74752008914948, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.42127990722656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.4313609600067, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.25791907310486, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.77423977851868, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.5366405248642, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.3931188583374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.56047928333282, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.1374410390854, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.86272037029266, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.91935896873474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.21360063552856, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.21856009960175, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.04879999160767, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.9203199148178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.05248022079468, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.78048050403595, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.9871997833252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.03648006916046, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.37999892234802, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.65424036979675, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} 
-{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.96384024620056, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.22560095787048, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 185.2950394153595, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.53231966495514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.79983949661255, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.01967930793762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.86303961277008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.01760005950928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.16432011127472, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.55104076862335, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.43856120109558, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.3924798965454, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.37008094787598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.56496012210846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 230.8844769001007, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 230.60800075531006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.93551993370056, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.59008073806763, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.19871830940247, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.59888100624084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.19296061992645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.89247858524323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 231.6652810573578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 231.54736042022705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.4027203321457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
189.43359971046448, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.83327770233154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.00016021728516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.25824058055878, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.32352018356323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 232.94415950775146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.33104038238525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.81968021392822, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.57551956176758, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.4515197277069, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.6992003917694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.36015856266022, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.26719880104065, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 233.24703931808472, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.65280055999756, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 191.49711906909943, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.2491194009781, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.32896256446838, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.16896200180054, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.69264030456543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.8857605457306, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.9398386478424, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.15072095394135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.40432024002075, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.5068792104721, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.16815972328186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, 
"num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.4772790670395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.5630396604538, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.1950399875641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.1499207019806, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.70640003681183, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.53648054599762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.3998395204544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.1524807214737, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.98399913311005, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.26672065258026, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.55951976776123, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.85712039470673, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.31472027301788, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.61184084415436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.39295935630798, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.16511988639832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.1276797056198, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.66991865634918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.24351906776428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.010560631752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.34303975105286, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.00735998153687, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.10719847679138, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.199840426445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.44687938690186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
157.31727957725525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.80768084526062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.3873610496521, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.91631960868835, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.51616048812866, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.96559989452362, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.90031898021698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.19999992847443, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.17680060863495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.08559834957123, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.57824039459229, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.59632062911987, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.72127985954285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.92304074764252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.01920127868652, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.90704035758972, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.1710386276245, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.22160017490387, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.5967993736267, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.1868795156479, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.22224009037018, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.82863926887512, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.09647965431213, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.8220797777176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.36751890182495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.4454402923584, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.51712048053741, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.59407937526703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.8430414199829, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.7148813009262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.44256055355072, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.89279973506927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.44048023223877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 204.4696009159088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.13263976573944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.49151873588562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.14208030700684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 200.76879978179932, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.95296132564545, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.8542401790619, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.1551994085312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 203.20144057273865, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.4385598897934, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 185.22816061973572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.55791974067688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 203.7329602241516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.5001586675644, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.52832078933716, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.74160051345825, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.60736083984375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.7956793308258, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
161.51119947433472, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.56512022018433, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 150.91855883598328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.354079246521, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.14991998672485, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.98976004123688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.9126387834549, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.73264026641846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.83071970939636, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.6275199651718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.50543999671936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.83551919460297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.77504110336304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.42975914478302, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.38704097270966, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.93152022361755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.01104032993317, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.4513601064682, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.91376042366028, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.08207952976227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.8555190563202, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.99152040481567, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.71023952960968, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.76879930496216, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.01855874061584, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.0619193315506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.3384004831314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.10511946678162, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.87984085083008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.75295972824097, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.15808033943176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.03152060508728, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.16528034210205, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.6841596364975, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.0489593744278, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.21919977664948, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.06384015083313, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.29728066921234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.78608000278473, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.76623928546906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.4524803161621, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.56639957427979, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.89151895046234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.12063896656036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.67711997032166, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.52255988121033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.38416051864624, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.86464047431946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.4291205406189, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.11039912700653, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.26736080646515, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 161.52336061000824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.90527963638306, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 259.579039812088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.1216002702713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 261.8436801433563, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.86928033828735, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 260.44528126716614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.8523187637329, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 263.23792338371277, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.60656070709229, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.0004804134369, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.37855851650238, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.8857593536377, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.63296175003052, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.0030403137207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.63072049617767, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.2297601699829, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.67119979858398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.9124791622162, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.50512146949768, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.55184054374695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.19440078735352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.49727964401245, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.37599980831146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.25760090351105, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.9635202884674, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.59504067897797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.15344083309174, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 193.53616058826447, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.9177609682083, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.34287905693054, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 191.49551928043365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.4083207845688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.47408092021942, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.68063962459564, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 185.73984026908875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.1723198890686, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.3742400407791, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.33616054058075, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.03407907485962, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 191.89024031162262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.23391997814178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.96016025543213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.91359865665436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.34271812438965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.62304186820984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.21487975120544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.9670408964157, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.75471985340118, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.69840002059937, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.1233607530594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 155.74863970279694, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.4263995885849, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.10112011432648, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.45759975910187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.896479845047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.36655950546265, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.75375962257385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.97776019573212, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 152.56959974765778, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.4251207113266, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.47439908981323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.0039985179901, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.14047861099243, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.53871977329254, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.80736076831818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.13104021549225, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.0048007965088, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.28239905834198, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.5307193994522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.6656002998352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.29392170906067, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.59440004825592, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 196.32783830165863, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.62688064575195, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.86799955368042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.32800030708313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, 
"num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.2555195093155, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.5619192123413, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.12976050376892, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.44992101192474, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.8102412223816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.6822406053543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.32912051677704, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.38991963863373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 200.06319761276245, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.0823996067047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.89967954158783, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.32176005840302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.55328011512756, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.68655967712402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.67535960674286, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.22607898712158, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.72272372245789, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.86719965934753, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.46352005004883, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.32240045070648, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.2708775997162, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.3342399597168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.88031935691833, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.73312056064606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.58175897598267, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 167.95983910560608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.70463871955872, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.7838408946991, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 206.30447924137115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.78480052947998, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.6644802093506, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.07919669151306, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.48752164840698, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 232.25311756134033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.39935839176178, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.4579187631607, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.26864171028137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.88159823417666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.4624000787735, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 224.87695813179016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.32368111610413, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.20944106578827, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.81952118873596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.83712244033813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.49727964401245, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 227.5519984960556, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.5512011051178, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.51216113567352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.13535904884338, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.40959930419922, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.63311982154846, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 227.93264091014862, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.2776017189026, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.743199467659, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.2022407054901, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.12400043010712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.8691202402115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.7564799785614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.09503960609436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.60639917850494, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.13216030597687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.63296020030975, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.82367980480194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.12992131710052, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.8766404390335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.47664082050323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.93616092205048, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.984800696373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.757758975029, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.92400085926056, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 153.69823813438416, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.0063999891281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.42656135559082, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.55247914791107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.61103999614716, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.29135990142822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 159.19727861881256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.7915209531784, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 154.5574390888214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 151.07968151569366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.80560171604156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.5083202123642, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.39759957790375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.0598406791687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.25311923027039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.8224000930786, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.68255925178528, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.7468799352646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.32255935668945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.59264087677002, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.27200043201447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.04255974292755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.90607941150665, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.24704158306122, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.91567969322205, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.55248081684113, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.17872047424316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.4097602367401, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.06048047542572, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.67519783973694, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.71871995925903, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.4862381219864, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 
2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.17551958560944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.40143883228302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.81695973873138, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.553280711174, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.13776004314423, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.33488059043884, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.72944116592407, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.37648034095764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.55328047275543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.49967873096466, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.27728056907654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.9236809015274, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.89984214305878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.43728017807007, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.89904034137726, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.45871901512146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 231.47487878799438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 207.36495971679688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.82607913017273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.81967997550964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 230.35696029663086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 206.04048132896423, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.15471816062927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 185.25120079517365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 229.41807985305786, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
206.3483190536499, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.93311965465546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.84720063209534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 231.87552213668823, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 206.9705581665039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.8438402414322, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.63039934635162, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.62480008602142, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.74032056331635, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.34880018234253, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.56303906440735, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.83616018295288, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.4902399778366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.9276807308197, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.41120088100433, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.79808020591736, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.81232011318207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.91087925434113, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.08816003799438, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.1292805671692, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.9582403898239, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.33183991909027, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.96863925457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.0087994337082, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.51327979564667, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.3899201154709, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} 
-{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.47488141059875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.76576161384583, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 157.35455989837646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.48287868499756, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.28928089141846, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.15184140205383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.0643196105957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.66607999801636, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.8403195142746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 156.78703904151917, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 155.29919981956482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.48640024662018, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.83759808540344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.90367913246155, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.05728149414062, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.45280003547668, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.10607981681824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.1484798192978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.722238779068, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.0113605260849, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.8334412574768, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.17631912231445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.28896045684814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.50672006607056, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.0951999425888, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 174.9264007806778, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.1619223356247, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.7431995868683, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 315.8030414581299, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 186.55376076698303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 314.1470408439636, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.68752002716064, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 313.718878030777, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 192.74671971797943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 312.5377595424652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.49568057060242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 211.00928008556366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.43535923957825, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.0980784893036, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 205.11056005954742, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.88384091854095, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.31887888908386, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 212.02159881591797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.9027200937271, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.79312086105347, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 209.25599932670593, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.1166399717331, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.47167909145355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.49951922893524, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.47871923446655, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.02896010875702, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.77856063842773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.92063987255096, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.3062402009964, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.55487871170044, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.12224090099335, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.82448017597198, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 209.62159872055054, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.2233612537384, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.0076801776886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.44864201545715, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.7331190109253, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.0635199546814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.725279211998, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.3529658317566, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.99152100086212, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.4566388130188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 198.22880148887634, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.69360053539276, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 179.85759973526, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.20048010349274, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.20736038684845, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.79168117046356, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.96319949626923, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.79311990737915, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.34767985343933, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 175.6791990995407, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.53760063648224, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.03488051891327, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.14975833892822, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 191.9319999217987, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.62143921852112, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.70431971549988, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 212.20144152641296, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.51007986068726, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.04687929153442, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.49151968955994, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.13551926612854, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 191.64543986320496, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.1158413887024, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.14223992824554, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.34960162639618, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.4664009809494, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.4425595998764, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.08463847637177, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.81535875797272, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.87183952331543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 188.7304002046585, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.3415995836258, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.64767956733704, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.73680138587952, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 213.74927937984467, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, 
"num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.4323160648346, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 234.87696170806885, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.11264038085938, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 232.46831893920898, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 196.74911975860596, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 212.60720074176788, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.73920142650604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.6107213497162, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 229.1430377960205, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 226.0867202281952, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 201.46607875823975, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 216.69024109840393, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.80512046813965, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 233.33600163459778, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 231.73727869987488, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 229.18943762779236, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.16432082653046, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 212.0748782157898, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 210.86415767669678, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 232.87392020225525, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.79472136497498, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 230.30336260795593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 294.5622384548187, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 296.5008008480072, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 285.1742386817932, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 294.405118227005, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.7708809375763, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 300.2516806125641, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 292.02272057533264, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 297.50223755836487, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.60176002979279, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.60128045082092, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.7908810377121, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.66800010204315, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 176.74912095069885, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.0027197599411, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.6252804994583, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.21456038951874, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.34960079193115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.7959998846054, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.76576030254364, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.8563185930252, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.71999967098236, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.314399600029, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.04816055297852, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.10815978050232, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.78799974918365, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.16352033615112, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.9838389158249, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 170.23151993751526, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.89567971229553, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.15759921073914, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.25039982795715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.20527935028076, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 167.81984090805054, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.04127979278564, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.13072037696838, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.015199303627, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.47120141983032, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 160.4281586408615, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.71311902999878, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 174.7104001045227, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 206.92512094974518, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.34607899188995, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 
128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 227.73215889930725, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.33023929595947, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.14239859580994, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.5110386610031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.7659239768982, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.1804802417755, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 205.0601589679718, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.49696099758148, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.99072408676147, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.093279838562, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 206.10463917255402, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.28672075271606, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.5974419116974, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.06976056098938, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.73232126235962, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 165.281919836998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.064000248909, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.10015964508057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.7222397327423, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.23727977275848, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.35791981220245, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 166.44047915935516, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.52815961837769, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.59343934059143, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.45616137981415, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 160.28223931789398, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.36703991889954, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 164.74832117557526, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 162.2865605354309, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 173.46239984035492, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 163.4827196598053, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.3974405527115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 175.3734403848648, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.63552105426788, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 159.650399684906, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 172.8736013174057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 161.9953602552414, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 158.8182407617569, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.12192142009735, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 177.96160101890564, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.69151973724365, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.7414401769638, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.54399859905243, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 180.46752035617828, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.10336124897003, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 178.71616005897522, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 227.779198884964, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 207.62336134910583, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.58287930488586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 206.46415948867798, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 227.30000138282776, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 205.49311876296997, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 228.69855880737305, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 204.8859190940857, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.0721607208252, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 273.7169587612152, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 244.03184056282043, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.80335879325867, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 290.05072236061096, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.18207812309265, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 247.00400233268738, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.38623976707458, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 288.5219204425812, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 277.09375977516174, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.77520370483398, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 249.65999960899353, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 292.14239954948425, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 272.40320086479187, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.74687957763672, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 251.80896162986755, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 287.74927973747253, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 275.74896335601807, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.5024013519287, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.53792142868042, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.2574405670166, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.5294370651245, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 246.6815996170044, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 252.3460793495178, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.96991991996765, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 278.9619183540344, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 245.34991979599, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 248.07568192481995, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 289.84912037849426, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 275.0611209869385, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 246.1238396167755, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 250.45407891273496, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.4670422077179, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 341.28031969070435, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 342.1727979183197, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 339.3601596355438, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 343.49743843078613, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 337.02688217163086, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.79840064048767, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 334.4364798069, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 204.90207970142365, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 197.42191970348358, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 187.72048115730286, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 201.3379204273224, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.38480019569397, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.08336114883423, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 205.77903747558594, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 201.92095935344696, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.41392064094543, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 201.01472079753876, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 194.48048055171967, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.8647998571396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 203.16927909851074, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.9374407529831, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 189.26367938518524, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 201.61232113838196, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 195.0217628479004, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.95680105686188, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 203.39184165000916, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 200.48672378063202, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 190.26671886444092, 
"config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 199.8686408996582, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 191.29791975021362, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 183.61423909664154, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 267.8993618488312, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 239.57088112831116, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 271.0646402835846, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 236.96399927139282, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 268.3617603778839, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 237.1057629585266, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 270.80623984336853, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 235.5449616909027, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 225.28208136558533, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 182.99296081066132, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 208.6732804775238, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 169.4488000869751, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 225.94768166542053, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 184.64687943458557, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 209.83007788658142, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.3123208284378, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 225.6771218776703, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 181.56256079673767, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 209.6016013622284, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 171.4497607946396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 225.54175853729248, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 185.20048022270203, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, 
"num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 209.4760024547577, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 168.41823995113373, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1046.9865655899048, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.89664149284363, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1039.7495985031128, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 242.25871801376343, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1048.6976099014282, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 240.28000116348267, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1049.9424028396606, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 241.4414393901825, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.833432674408, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.8878357410431, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.4657554626465, "config": {"BLOCK_SIZE_M": 256, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.32623958587646, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.890079498291, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.58991980552673, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.1241636276245, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.22288155555725, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.0121564865112, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.59711813926697, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.8275213241577, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 399.49488282203674, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.8575963973999, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.46767950057983, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.6048030853271, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 402.61215806007385, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.027045249939, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.73151993751526, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.4056024551392, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.46607971191406, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.9331202507019, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.94943833351135, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.6887955665588, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.10223841667175, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2004.9923133850095, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 280.5076801776886, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2004.9726390838623, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 279.371680021286, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2009.775676727295, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 277.35008120536804, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2001.6785526275632, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 275.7851207256317, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.5899221897125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.41791915893555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.3571183681488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.5233588218689, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.6763210296631, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.85487961769104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.32143998146057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.7729594707489, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.96288204193115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 342.6679992675781, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.2633628845215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.765442609787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.894079208374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.8280007839203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.6251208782196, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.41264390945435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.40736150741577, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.8375999927521, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.15296244621277, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 349.55039858818054, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.0814392566681, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.0839982032776, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.1713614463806, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.5841598510742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.0174403190613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.9780797958374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.2766389846802, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.79936051368713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.05168080329895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.54928064346313, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.0457592010498, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.09791898727417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.1951994895935, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.7022387981415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.80336117744446, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.1308796405792, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.82352089881897, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.44288086891174, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.932000875473, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.73775911331177, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.8291189670563, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.7942407131195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 343.31103920936584, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.4702398777008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.67232060432434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.1244761943817, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.04992294311523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 386.802237033844, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 344.79056000709534, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 343.85151982307434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.01856303215027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.92719650268555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.5420799255371, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.1031982898712, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.7860805988312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 391.2289583683014, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.497918844223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.41663932800293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.31024146080017, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.9520003795624, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.7297646999359, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.302401304245, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.9635193347931, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 383.87743949890137, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.0740761756897, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.2872009277344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.44592332839966, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.9347233772278, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 423.2379174232483, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 408.87295484542847, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.8046350479126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9209585189819, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.44351983070374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 
16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.25920033454895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.3529648780822, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.91695976257324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 425.6774389743805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.61424112319946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.1743965148926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.7612724304199, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.32080006599426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.9553575515747, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.55631923675537, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2961602210999, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 421.61983847618103, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 408.2192015647888, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6585645675659, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.4771203994751, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.9540753364563, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 397.08863854408264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.99551916122437, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.04512166976934, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 414.6875214576721, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 420.44528007507324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.5196838378906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.514726638794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.43135929107666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 339.58848118782043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 343.0415999889374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.9095993041992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.9635200500488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.93136048316956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.6720037460327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.2540822029114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.1446385383606, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.32384276390076, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 342.43264079093933, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.9241580963135, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.94656109809875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.6916787624359, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.65856170654297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.6825602054596, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.0950403213501, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 341.18223786354065, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.48560094833374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.95504117012024, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.0113613605499, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.9260823726654, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.3894410133362, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.22528100013733, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.30623841285706, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 341.11727833747864, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 342.6046419143677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.47183990478516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, 
"num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.12752079963684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.4619174003601, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.64095997810364, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.90176010131836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.80735778808594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 344.86608266830444, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.4193594455719, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.2404816150665, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 349.7056007385254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.04111981391907, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.0017580986023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.52351927757263, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.01024055480957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 349.2580807209015, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.84815788269043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.32255959510803, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.61375856399536, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.4623987674713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.37487840652466, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.48256158828735, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.93232226371765, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.7238392829895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.56447982788086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.5804777145386, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.865761756897, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
351.7179214954376, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.51279640197754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.67648005485535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.8604781627655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.3808009624481, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.20687770843506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.6556794643402, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.6340775489807, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 348.7652778625488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.4944031238556, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.5984010696411, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.55359721183777, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.8644742965698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.25072288513184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 408.88320446014404, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.0764811038971, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.0502419471741, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.0169606208801, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 408.14671874046326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 397.8484785556793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.98735666275024, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.6444787979126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 413.2750380039215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.47216176986694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.9644775390625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.2780821323395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 412.89552330970764, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.72896122932434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.0860800743103, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.5313606262207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.76383924484253, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 342.6635229587555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.0982406139374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.207679271698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.72832131385803, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 343.8540816307068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.13647985458374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.7230398654938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.81967878341675, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.9747188091278, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.5580792427063, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.17903995513916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.0478403568268, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 345.17711877822876, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.62160062789917, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.1153597831726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.8246397972107, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.45887994766235, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.90367913246155, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.9979181289673, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.45792174339294, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.8243193626404, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.0964787006378, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.67455887794495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.9927999973297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 341.6254389286041, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 346.0527992248535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.40320205688477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.9851191043854, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.60816383361816, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.03568053245544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.6684787273407, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.78528237342834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.65312099456787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 364.7436797618866, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.4284813404083, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 379.1543996334076, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.04896235466003, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.99823808670044, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.2345595359802, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.42816138267517, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.43200159072876, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.5271985530853, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.38592052459717, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.9892797470093, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.82623839378357, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.7487995624542, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.9412808418274, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.19039821624756, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.2830390930176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.2203199863434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.511198759079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.1075220108032, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.1379237174988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 417.2507178783417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.6563243865967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 416.0483229160309, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.801598072052, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 419.39695596694946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.7751998901367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 416.7587184906006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.5607979297638, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.439838886261, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.6131205558777, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.41167879104614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.688481092453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.7660789489746, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.9688003063202, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.31936287879944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.59983706474304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.0332806110382, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.2094385623932, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
364.1113615036011, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.68111872673035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.59967947006226, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.39983916282654, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.1115207672119, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.2902412414551, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.9166407585144, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.65312099456787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.84272170066833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.77775979042053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.8860788345337, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.5297598838806, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.07247829437256, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 423.2803225517273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.4257607460022, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 422.7028822898865, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 411.20911955833435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 424.1646361351013, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 403.1371212005615, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 433.1009578704834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 397.014080286026, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.9987196922302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 351.6006398200989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.38112235069275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.65184020996094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 418.2320022583008, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.1993598937988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 384.6611201763153, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.2161555290222, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 384.8483204841614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.9476807117462, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.3263998031616, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.547518491745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.70048213005066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.4912037849426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 384.42352056503296, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.0630407333374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.2523195743561, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.7355201244354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.9225609302521, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.8668808937073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.93343901634216, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.28959941864014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 388.19488048553467, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 388.08544278144836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.1404821872711, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.07983922958374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.0779182910919, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.40111804008484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 381.7400002479553, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.19904088974, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.0273609161377, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.0462417602539, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.73519921302795, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.0271999835968, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.5371198654175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 453.8923192024231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.2135977745056, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.08367919921875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 383.34271788597107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.241916179657, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.4007980823517, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.72255992889404, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.2268810272217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 451.80208444595337, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 402.5428819656372, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.8991997241974, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.215039730072, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.2887969017029, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.5060818195343, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.52143955230713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.6553599834442, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.00783801078796, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 400.23695826530457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.11504340171814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.5068814754486, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.24592113494873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.6883237361908, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.9115207195282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.4185588359833, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.7844805717468, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 396.15007758140564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.24959993362427, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.43903732299805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.41440057754517, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.4249610900879, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.7996826171875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.9969639778137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.81311559677124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7753615379333, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 
8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.9249625205994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.10383892059326, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.3755221366882, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.4019150733948, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.260968208313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.7377586364746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.5563173294067, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.3742370605469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.6419191360474, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.2289552688599, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.3734402656555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.0355205535889, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.8134393692017, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 
32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.6851134300232, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.3137617111206, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.0582404136658, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.7771203517914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.0260806083679, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.2487988471985, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 379.76768016815186, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.4931173324585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.58335995674133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.679678440094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.4758417606354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.73792028427124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
357.57776141166687, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.63135838508606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.1825575828552, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.11215901374817, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.2548773288727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.073118686676, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 394.5423996448517, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.81087851524353, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.5443186759949, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.0348825454712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 394.0619206428528, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.4127984046936, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 353.879998922348, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 355.7588815689087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 400.3596806526184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.56176352500916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.20928263664246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.5992012023926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.40607810020447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.5083222389221, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 355.50928235054016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.85007905960083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.11424136161804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 383.7977600097656, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.62751722335815, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.4003210067749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.8984007835388, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.5193591117859, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.09023809432983, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 402.92192339897156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 384.5527982711792, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.27455735206604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 381.62495970726013, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.2089583873749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.3169593811035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.44816279411316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.5252802371979, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.53264117240906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.8889584541321, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.59439992904663, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.7447998523712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.9720034599304, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.84640192985535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.8728015422821, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.87167477607727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 397.1996808052063, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.19983768463135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.58976125717163, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.97712087631226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.99295687675476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.16992020606995, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.27711820602417, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 388.1816029548645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 403.5488021373749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 381.84383749961853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.83935832977295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.4552011489868, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.722718000412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.60400581359863, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.3881583213806, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.2761559486389, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.42463731765747, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 451.9806408882141, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.4912042617798, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 473.49087476730347, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.0980851650238, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.39424085617065, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.8846406936646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.7939205169678, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.7867183685303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.02688121795654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.5734438896179, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.86848306655884, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.5278356075287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.7054388523102, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 369.00784373283386, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.3088004589081, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.62479877471924, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.85263562202454, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 379.19536113739014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.7564797401428, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.08079981803894, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 347.53103971481323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.3243179321289, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.4099187850952, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.8193590641022, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.1049563884735, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.12287974357605, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.2011194229126, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.7438397407532, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 354.313600063324, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.6476786136627, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.011682510376, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.8743999004364, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 403.64431858062744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.81647849082947, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 360.33088088035583, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.92848014831543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 350.68943977355957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.0681571960449, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.2251205444336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.4359998703003, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 410.67471742630005, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.6622385978699, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.65808057785034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.219521522522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.8302412033081, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 387.39375948905945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 418.1006383895874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.7520024776459, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 396.27135276794434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 385.8844804763794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 425.2235162258148, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.9435176849365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.2632019519806, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 395.34111857414246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 427.8235173225403, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.71247720718384, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 400.7270383834839, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 391.0100769996643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 428.24560165405273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.6801574230194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.2201633453369, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.756959438324, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 785.9078407287598, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 445.2998352050781, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 773.4543943405151, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.35072088241577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 782.4633526802063, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.55023884773254, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 356.28512144088745, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 401.4401614665985, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 391.83664202690125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 434.3164849281311, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 376.30783796310425, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.61056089401245, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 357.44704246520996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.1374409198761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 390.653920173645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 439.7168028354645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.89503931999207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.4851200580597, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.1511986255646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 402.015997171402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.6796820163727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.1756854057312, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 382.6379179954529, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.1532790660858, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.89264130592346, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.28080463409424, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 391.7915213108063, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 442.0246386528015, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 383.45679998397827, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.09855461120605, "config": {"BLOCK_SIZE_M": 
32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.7404832839966, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 427.10432052612305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.0264048576355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 423.5956811904907, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.1567983627319, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 416.99999809265137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.6719994544983, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 422.7384042739868, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 413.5652816295624, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.6907217502594, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.21552205085754, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.27584409713745, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 431.3356876373291, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.71440410614014, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.39040207862854, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.7235198020935, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 423.8988780975342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.2407991886139, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 380.12719988822937, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.5287938117981, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 437.84703731536865, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 384.74496126174927, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 409.3963158130646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.32032203674316, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 418.19632291793823, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.04255747795105, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.824960231781, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 446.679847240448, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 434.15071964263916, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 388.77920031547546, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.0116775035858, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.348002910614, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 413.7985599040985, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.18127727508545, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.6857604980469, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 450.0094413757324, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 438.58800172805786, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 386.2455987930298, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 
8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.30256032943726, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.34704065322876, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.5243215560913, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.8396773338318, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.1598482131958, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.4027171134949, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.1940841674805, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.6007943153381, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.8694396018982, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.1163239479065, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.8313660621643, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.6585597991943, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.717116355896, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.5123205184937, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 449.64112520217896, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.83039999008184, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.3241629600525, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.3905620574951, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.8428783416748, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.1305575370789, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.13823795318604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.60111999511713, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.55247831344604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.7777619361877, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.4174389839172, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.3292832374573, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 698.331356048584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.359842300415, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 681.4406394958496, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.181435585022, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 686.800799369812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.3745579719543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.4708819389343, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 698.721284866333, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 365.24911761283875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 364.1425585746765, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 383.341760635376, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.20928168296814, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 390.83712220191956, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.58672165870667, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.88511657714844, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 396.49280071258545, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.1388795375824, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.2590401172638, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.95647978782654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 379.3675231933594, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.71935749053955, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.8390402793884, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 398.14048051834106, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 396.55919790267944, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.6742398738861, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.6772794723511, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 378.2427203655243, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.82736015319824, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.0296006202698, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.3807978630066, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.18912267684937, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 393.3033585548401, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.3095989227295, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 366.48624062538147, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 373.5376012325287, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 375.80928087234497, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.2969596385956, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 372.59616136550903, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 394.25456166267395, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 389.13376331329346, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.06367921829224, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 416.65743827819824, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.2703976631165, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 410.9447991847992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.2446389198303, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 421.3369596004486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.409761428833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 413.63168001174927, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.2580828666687, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 415.29184341430664, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.8752021789551, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 
64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 414.4822382926941, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.02303981781006, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 421.29440784454346, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.1267223358154, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 412.3483216762543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.3872013092041, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 370.96912026405334, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 352.2875213623047, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 407.02927470207214, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 368.87807965278625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.50127840042114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 394.9934387207031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
374.57168340682983, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 363.295681476593, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 404.59535121917725, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.4092798233032, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 359.00784134864807, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 396.4246428012848, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.40368127822876, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 367.07056045532227, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 408.6971187591553, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 371.86463952064514, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 358.40416073799133, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 395.06096363067627, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 377.21984028816223, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 362.0948803424835, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 406.5184020996094, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 374.0825581550598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 361.2387239933014, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.95408487319946, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 424.2969584465027, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 431.90751791000366, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 426.376314163208, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 429.4601607322693, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 427.5508785247803, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 433.93439769744873, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 423.7883222103119, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.1495933532715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 
2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 434.93279933929443, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.0990381240845, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 434.5747184753418, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.3980793952942, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 436.6902446746826, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.1707158088684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 432.9801559448242, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 680.3433632850647, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.4752011299133, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.4640030860901, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.419683933258, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 685.7265591621399, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.6284809112549, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.9652786254883, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.1249613761902, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 682.6464033126831, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.178081035614, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.9079976081848, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.0596823692322, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.9752039909363, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 637.0489621162415, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.6879992485046, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.2524814605713, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.7729578018188, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.2892823219299, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.4974374771118, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.0148782730103, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 676.7872071266174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.55504322052, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.8300805091858, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.6980786323547, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.3321633338928, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 650.3438329696655, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.3699131011963, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.726719379425, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.2940802574158, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.5793724060059, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.8817591667175, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 586.19952917099, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 816.7271971702576, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 804.858386516571, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.522566318512, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 784.1417622566223, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 815.1971197128296, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 789.3760061264038, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 820.2936053276062, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 798.3564829826355, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.9671995639801, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 455.3204846382141, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 425.29152154922485, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.5383915901184, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 435.8899235725403, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 410.2552008628845, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2796783447265, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 457.8385579586029, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 436.46512031555176, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.05856132507324, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 443.22304010391235, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 417.2472023963928, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.56783390045166, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.4348797798157, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 428.0430340766907, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.62896156311035, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 447.68944025039673, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 413.3393609523773, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.1817612648011, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 452.4132823944092, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 425.84911823272705, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.95551919937134, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.0324811935425, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 408.69728088378906, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.13840675354, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.4559993743896, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.6900806427002, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.984959602356, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.1736001968384, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.9135999679565, "config": {"BLOCK_SIZE_M": 
128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.827356338501, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.0371255874634, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.9036831855774, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 441.5608024597168, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.86783313751215, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 400.36367893218994, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.0955200195312, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.97567653656006, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.4403190612793, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 405.0486397743225, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.8708834648132, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 440.81536054611206, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 513.8788771629333, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 401.7859184741974, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.4353585243225, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 444.0551996231079, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3358335494995, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 403.38640093803406, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2157.370252609253, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.2390398979187, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2155.1417446136475, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.4395213127136, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2131.9035243988037, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.1879997253418, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2130.774555206299, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.282399892807, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1347.037591934204, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.356481552124, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1332.7947187423706, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.4536037445068, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1370.317120552063, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 958.6412858963013, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1347.9142379760742, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 980.7564830780029, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1312.3680114746094, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.7958307266235, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1353.0207967758179, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 959.7055912017822, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1315.1753616333008, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 922.0108890533447, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1355.9905529022217, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 957.0036745071411, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1369.8750305175781, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 839.3732786178589, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1363.5646343231201, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 843.1299209594727, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1358.3686447143555, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 854.3976020812988, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1360.46639919281, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.8870277404785, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5400.282554626465, "config": 
{"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.2171168327332, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5521.018867492676, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 712.3126459121704, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5568.6542320251465, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.9088039398193, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5514.854431152344, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 8, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.6579208374023, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.400643825531, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.9092836380005, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.1172823905945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.7408003807068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.987361907959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 489.8251152038574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.79264307022095, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.63056087493896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.01664304733276, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.4689564704895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.8795237541199, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.2563228607178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.0180835723877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.7279987335205, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.87760066986084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.448956489563, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.9009623527527, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.80447816848755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.23360204696655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.8470411300659, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8371243476868, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.03808307647705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.84784173965454, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.77503633499146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.5131254196167, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.0177607536316, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.0947184562683, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.44032096862793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.7126398086548, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.16592264175415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.15376329421997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.04224395751953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.3411202430725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.4599928855896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.8124785423279, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.7155222892761, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.070237159729, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.8988814353943, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.5513606071472, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6244759559631, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.0236768722534, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.5019235610962, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.53408002853394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.3302412033081, "config": {"BLOCK_SIZE_M": 
16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.88192272186285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.9823932647705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.9681644439697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.3988819122314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.43344354629517, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.79311895370483, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.5998339653015, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.5876798629761, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.80431890487677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.8788814544678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.5124807357788, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.3894371986389, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 489.73055839538574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.4707221984863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.3148784637451, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.62432050704956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.1174383163453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.94544076919556, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.05744314193726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.1419253349304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.6916847229004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.0177612304688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.5499229431152, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.2756838798523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.9065656661987, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.7987236976624, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.3843207359314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.281277179718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.3382396697998, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.5774435997009, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.9612793922424, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.6553583145142, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.6577596664429, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.1633577346802, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.129280090332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 723.0787229537964, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.3304018974304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.0911989212036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.7529635429382, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.34832239151, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.1279997825623, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.7484831809998, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.6841607093811, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 721.8694376945496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.0521626472473, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.4187164306641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 683.6155223846436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.5699162483215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.3118405342102, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.4272003173828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.6577596664429, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 720.1807975769043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.03855419158936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.89216136932373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 461.84208631515503, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.059841632843, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.6257586479187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.1444821357727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.69184160232544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.1319971084595, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.5099182128906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.04576206207275, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 465.6878423690796, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 475.067675113678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.4124798774719, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.1865644454956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.3537669181824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.47711992263794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.89551877975464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.04719400405884, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 464.06415700912476, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.33039712905884, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.64367628097534, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.8473539352417, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.3446407318115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.8161563873291, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.1214408874511, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.36927366256714, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.65040016174316, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.42336320877075, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.2849626541138, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.8023986816406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 463.82015228271484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.87744092941284, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.6052794456482, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.22704553604126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.3409605026245, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.8120031356812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.60399770736694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.67215728759766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.9139189720154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.48047971725464, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.4555196762085, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.9766402244568, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.34255933761597, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.4798374176025, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.7265558242798, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.6156826019287, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.45248222351074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.78463840484625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.96255922317505, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
475.70303201675415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.1379227638244, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.8209643363952, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.6668839454651, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.48415660858154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.90943670272827, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.9489541053772, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.59663677215576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.06143522262573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.68864011764526, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.21823501586914, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.38223695755005, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.2768020629883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.8963165283203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.07456398010254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.8452806472778, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.0979194641113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.28928136825556, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.133120059967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.8363146781921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.1732873916626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9248013496399, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.8006434440613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.1726393699646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.2156763076782, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.7091193199158, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.4611191749573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.8033547401428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.7782368659973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.08592033386236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.2017631530762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.63040351867676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.1135993003845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.91632080078125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.9867219924927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.8675165176392, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.7299189567566, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.1996831893921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 468.69919776916504, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.27152395248413, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.8038401603699, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.63552141189575, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.6734414100647, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.7776007652283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.88143730163574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.33904218673706, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.5924777984619, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.23855781555176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.999520778656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.98703956604004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.47151803970337, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 513.3892774581909, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.0939211845398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.9155201911926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.56912183761597, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.78975439071655, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.471200466156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.0088028907776, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.9056005477905, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.3862357139587, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.6009564399719, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.36463832855225, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.01903533935547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.49152135849, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} 
-{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.8369626998901, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.1068820953369, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.40688276290894, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.7883176803589, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.02847623825073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.8704047203064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.90239763259893, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.94143772125244, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.30656147003174, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.0460777282715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.05200147628784, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.87312173843384, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.0803198814392, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.1115174293518, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.60016107559204, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.1436758041382, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.8487935066223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.4116792678833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.3166465759278, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.97376585006714, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.86064529418945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.5206422805786, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.1236791610718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 847.0975923538208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.7028818130493, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 853.7065601348877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.457437992096, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 854.2985725402832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.5515203475952, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 862.2068881988525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.4817633628845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.8278388977051, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.0052766799927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.29439878463745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.99135637283325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 466.1006426811218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 469.05664682388306, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.82144021987915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 
24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.1884789466858, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.77232217788696, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.1883158683777, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.51343965530396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.58128023147583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.4750409126282, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.49439811706543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.6110324859619, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.5860815048218, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.14816093444824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.30335521698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.56463956832886, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.43007802963257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.52992391586304, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.34608030319214, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.78256273269653, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.4327988624573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.7833614349365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.467041015625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.3667178153992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.6975932121277, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.4921598434448, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.59343671798706, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.8545589447021, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.39695978164673, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 481.0692811012268, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.6680030822754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.31728315353394, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.3214421272278, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.8332777023315, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.97855615615845, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.2771215438843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.5403218269348, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.5910358428955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.42703914642334, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.22720193862915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.9544024467468, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.580002784729, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.2416038513183, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.1864042282105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.44112205505377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.9291248321533, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.1135983467102, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.61759901046753, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.0438389778137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.7072019577026, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.9804797172547, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.4688024520874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.5473628044128, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.5652766227722, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.69215965270996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.3788814544678, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.264639377594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.4707236289978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.0300760269165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.7937636375427, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4462366104126, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.4332814216614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.5544023513794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.88447237014765, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.3735918998718, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.3686370849609, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.4799971580506, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.4604802131653, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.2886424064636, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.9265565872192, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.84047842025757, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.41616058349615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.1624002456665, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.0084772109985, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.6414394378662, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.529757976532, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.2883205413818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.0571188926697, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.7320022583008, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.75535821914673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 608.4038400650024, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.0808029174805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.3692841529847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.6886386871338, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.9918508529663, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.3352003097534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.9107098579407, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.41984033584595, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.1582398414612, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.6043210029602, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.99727630615234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7839941978455, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.3747200965881, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.9831981658936, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 783.0198454856873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 814.6391940116882, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.4806361198425, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 712.4558424949646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.9104022979736, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.9678425788879, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 780.861759185791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 806.8563151359558, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.1468782424927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.4620785713196, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.3959937095642, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.197114944458, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 787.6835203170776, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 806.4433646202087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.4460830688477, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 712.0347213745117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.1579179763794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.5918412208557, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 779.4144034385681, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.6856060028076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.6097555160522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.811520576477, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.6844806671143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.62672185897827, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.2084822654724, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.14895725250244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.05984258651733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.92447328567505, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.32592010498047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.6969618797302, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.43584299087524, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.91135931015015, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 472.64559507369995, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.299204826355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.5310368537903, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.66847944259644, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.0955286026001, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 475.6545567512512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.204158782959, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 479.90400314331055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.0683250427246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.7543969154358, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.7822437286377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.02527952194214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.6473593711853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.94944429397583, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.0964789390564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.2278437614441, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.7108874320984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 467.2971200942993, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.6843204498291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.5854392051697, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.7180790901184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.2276773452759, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.8510413169861, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2492828369141, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.9841604232788, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.8430423736572, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.4569606781006, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.8008027076721, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.5592007637024, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.9675207138062, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.9864010810852, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9407949447632, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.0883193016052, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.7487988471985, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.992636680603, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.231999874115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.40223836898804, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.6191935539246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.2566418647766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.6510338783264, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.30831384658813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.7609601020813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.3787207603455, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.8862390518188, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.03696393966675, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.1632013320923, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.3855967521667, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.2307195663452, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.70000314712524, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.160481929779, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.5875129699707, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.5564832687378, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.75312185287476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.0271978378296, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.7043237686157, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.6507205963135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 731.5787196159363, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.5057654380798, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.6606373786926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.7057681083679, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.0148854255676, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.7852735519409, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.0175986289978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.038402557373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.7651171684265, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.5705647468567, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.8459167480469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.449761390686, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 731.1942386627197, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.9235215187073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.4508762359619, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.722243309021, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.77120304107666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.40847873687744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.56368112564087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.2615976333618, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.3188810348511, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.61120414733887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 480.4851245880127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.5836796760559, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.206392288208, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.9520010948181, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.75119638442993, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.2505569458008, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 471.92240715026855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.32512187957764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 478.55615615844727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.8715214729309, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.6545624732971, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.07456731796265, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.14319849014277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.6220769882202, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.56127405166626, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.78272008895874, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 486.9639992713928, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.707200050354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 475.2280068397522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.41199588775635, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.3900828361511, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.04543685913086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 473.68640184402466, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.48640060424805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.52000093460083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.10048484802246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.7534456253052, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.2929553985596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0417609214782, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.243200302124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.2686376571655, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.285279750824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.08895874023443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.1867184638977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.8993611335754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1977596282959, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4747180938721, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.81679582595825, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.6769595146179, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9185547828674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9702391624451, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1036.7870473861694, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.5051140785217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1046.2390422821045, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.1932787895203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1046.6519975662231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.6742415428162, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1044.543514251709, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.4646401405334, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.842565536499, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.3147196769714, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.1494421958923, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.9870395660401, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 470.5614376068115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.566556930542, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.1870393753052, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.3379240036011, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.54752016067505, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3619117736816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 476.29087686538696, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.52160024642944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.10031938552856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.83615922927856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.50399875640875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2008056640625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 477.0345616340637, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 481.93583965301514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.6608033180237, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.1430411338806, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.7654371261596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.9604845046997, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 474.7700810432434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 482.4092745780945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.1110486984253, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.5150375366211, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.0920014381409, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.6753582954407, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.5708770751953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.36879730224604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.6083173751831, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.5787253379822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.9463958740234, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.89552116394043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.3484778404236, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.5164813995361, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.8527979850769, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.0395221710205, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.773115158081, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.9414367675781, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.9071979522705, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.21087598800665, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.3262405395508, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.1156787872314, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.7318396568298, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.7003231048584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.7531261444092, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.2833642959595, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.9310350418091, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.28383922576904, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.6716771125793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.9460773468018, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.1279988288879, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.6995258331299, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.0324811935425, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.5681557655334, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
556.7276787757874, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.55279636383057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.5289583206177, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.6558408737183, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.7048015594482, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.1683168411255, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.1206426620483, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.4311962127686, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.0340805053711, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.0395178794861, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.4912009239197, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.6211194992065, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.8147192001343, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.0406403541565, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.2803211212158, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.9638409614563, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.4401597976685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 735.8587193489075, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.3001618385315, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.1924767494202, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.8880033493042, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.6904010772705, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.6627159118652, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 735.3420805931091, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.2684845924377, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.919517993927, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.5945582389832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.7884798049927, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 645.8612775802612, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 732.513120174408, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.4740858078003, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 707.9084801673889, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 909.161434173584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 929.0564870834351, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 918.5683155059814, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.5052795410156, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 920.7291269302368, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.9278326034546, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 918.1777667999268, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 937.9136037826538, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.9291195869446, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.4052758216858, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.24128055572515, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.25631999969477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.0483202934265, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.82959604263306, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7376008033752, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.1316819190979, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3908772468567, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.2968006134033, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.4148797988892, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
509.46095943450933, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.2516841888428, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.6539192199707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.5595216751099, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.4401602745056, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.4579219818115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.21792125701904, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.1166429519653, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1307225227356, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.6961603164673, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.3313570022583, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.8124785423279, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9503984451294, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.8972768783569, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.65936040878296, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.0369591712952, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.5923271179199, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.4888033866882, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.776801109314, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.0369563102722, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.0564775466919, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.1297578811646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.0991978645325, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.4423990249634, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.1862421035767, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.2233581542969, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} 
-{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.0364861488342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.5759973526001, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.8592066764832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.3948802947998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.9006423950195, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.9043231010437, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.0724849700928, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 637.7334475517273, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.29407787323, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.2705583572388, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.9001603126526, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.31055784225464, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.080641746521, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.94592094421387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.3630361557007, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.5889620780945, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.53664445877075, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.438401222229, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.32239818573, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.243679523468, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.7697548866272, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.72816133499146, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.63711881637573, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.7100830078125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.54943990707403, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
490.1915216445923, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.08943939208984, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.40735673904425, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.4511995315552, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.85279655456543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.65903949737543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.3120036125183, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.11631441116333, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.84496259689325, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 485.3870391845703, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.9044780731201, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.9284749031067, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.5910396575928, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.5049619674683, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.3652806282043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.4571189880371, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.682240486145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.1683211326599, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.362557888031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.2540826797486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.7601580619812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.8334436416626, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.5731191635132, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9656019210815, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.2577614784241, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.7577600479126, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 919.0977478027344, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 874.7552013397217, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 756.3060688972473, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.7388820648193, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 921.1959886550903, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 861.1860752105713, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.2121620178223, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 784.9721670150757, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 912.193922996521, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 881.9087982177734, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.1528024673462, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.8446373939514, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.306568145752, "config": {"BLOCK_SIZE_M": 
128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 863.3812856674194, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.2852802276611, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.1214380264282, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 919.5694446563721, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 888.2473659515381, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.6396775245667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.0584011077881, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 919.3343925476074, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 862.6158475875854, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 769.6737670898438, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.6185617446899, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 916.5529584884644, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 879.6939182281494, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 761.3945579528809, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 771.7225646972656, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 926.3254356384277, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 865.9913539886475, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.3937511444092, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 788.101282119751, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1107.701120376587, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1057.808313369751, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1097.3603105545044, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1069.1529560089111, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1097.2671937942505, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1072.7227306365967, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1101.8201541900635, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1068.4824085235596, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.5830397605896, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.7286353111267, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.1070394515991, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.9907169342041, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.9278378486633, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.5656037330627, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.0745630264282, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.6241598129272, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.8638386726379, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.1241602897644, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.124321937561, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.6028747558594, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.0137577056885, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.2347211837769, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.9143986701965, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.2072019577026, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.6604838371277, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.4764742851257, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.7036843299866, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.5407962799072, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.6921577453613, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.0086326599121, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.0710425376892, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.3737664222717, 
"config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 861.4644765853882, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.940477848053, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 861.0625600814819, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.7523212432861, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 859.0662384033203, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 737.3638439178467, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 860.406084060669, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.2515215873718, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.1555190086365, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.5910339355469, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 684.4694375991821, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.677282333374, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 742.7108812332153, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.3068833351135, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.3060822486877, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.1395201683044, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.2188830375671, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.4937605857849, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 685.5468845367432, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.8824005126953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.9116764068604, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.9828796386719, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.3017597198486, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.6291246414185, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2439.718551635742, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.991039276123, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2439.4761657714844, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.2193632125854, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2454.920015335083, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.6356792449951, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2445.2486419677734, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.0816016197205, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1823.936471939087, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1275.231523513794, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1822.3835182189941, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1290.6222486495972, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1830.7166481018066, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1270.856966972351, "config": {"BLOCK_SIZE_M": 
256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1848.1737613677979, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1281.9995260238647, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1836.9475078582764, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1274.169602394104, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1849.0423965454102, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1275.0219249725342, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1833.1099224090576, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1269.8171138763428, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1852.1881675720215, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1290.0630378723145, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1864.9940872192383, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.131685256958, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 1870.2415943145752, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1160.5334424972534, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1865.4993438720703, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.476643562317, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1875.291519165039, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1149.147367477417, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7354.446449279785, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 950.5054426193237, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7384.832649230957, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.2988796234131, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7363.726768493652, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.7424144744873, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 24, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7391.756782531738, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 24, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 941.5540885925293, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.32016372680664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.11040592193604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.0401611328125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.5196776390076, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.8401570320129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.84799814224243, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.2270412445068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.9315228462219, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.3454418182373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.49183893203735, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.6057605743408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.7560033798217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2574367523193, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.5294451713562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.8528022766113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.5372838973999, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.74928188323975, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.9041647911072, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.9552035331726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.1254391670227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.1179265975952, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.2208008766175, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.5699214935303, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2286367416382, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.62671661376953, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.0147213935852, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.85567903518677, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.6723184585571, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9484839439392, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.1278438568115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.3892812728882, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.188485622406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.2982397079468, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.54959869384766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.17919969558716, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.74176311492926, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.5380735397339, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 509.51984167098993, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.8553576469421, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.9267230033875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.1755218505859, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.81024217605585, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.7123236656189, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.875039100647, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2884831428528, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.2454414367675, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.1627202033997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.4183979034424, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.1575999259949, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.1905603408814, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.01471614837646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9251136779785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6238441467285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.5215997695923, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.9964814186096, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.9159989356995, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.35424184799194, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.0009579658509, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.7408013343811, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.4070358276367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.9128007888794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.6710343360901, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.1092796325684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.8747172355652, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2273578643799, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.5648007392883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.8588833808899, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.5652747154236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.8296022415161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.0372776985168, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 723.475513458252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.2584013938904, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.0452828407288, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.3939199447632, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 707.0787215232849, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 721.7812728881836, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.9521579742432, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.477276802063, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.2215991020203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.292317867279, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.0625629425049, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.0491199493408, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.8100819587708, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 724.018075466156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.4343996047974, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.538562297821, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.6363186836243, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 754.1576051712036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.7463998794556, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.1646418571472, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.2003178596497, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.731041431427, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.8990359306335, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.5300765037537, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 732.1078395843506, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.646402835846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.4187211990356, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.0796785354614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.3464002609253, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.09152030944824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.38496589660645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.57263469696045, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.72128200531006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.10447835922247, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.5862398147583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.30191564559937, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 483.07008266448975, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.86767768859863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.28704071044916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.4027199745178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.2590432167053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.43952369689936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.1942372322083, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.5025601387024, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.2945647239685, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2728023529053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.3055996894836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.20784187316895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 486.1124801635742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.3400011062622, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.5700721740723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.28768253326416, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.0075206756592, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.4697632789612, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.1078481674194, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.4284815788269, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
486.7743968963623, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.79567813873297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.62351751327515, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.98047828674316, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.0241584777832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2131261825562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.8041577339172, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.40640163421637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.745762348175, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.9233565330505, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.83440113067627, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.34048080444336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.1209607124329, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.0296006202698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.07760620117193, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.8078370094299, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.1799969673157, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.5780844688416, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.21727657318115, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.5143957138062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.6316738128662, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.9318385124207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.58543443679804, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.25919723510737, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2894358634949, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6953601837158, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, 
"num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.1419196128845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.1596798896789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.9310331344604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.1428799629211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.0884785652161, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.76831674575806, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.9745635986328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.4521622657776, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.3798413276672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.4083228111267, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.2011179924011, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.2931213378906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.1755175590515, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.6528024673462, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.286075592041, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.6196784973145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.4230456352234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.793598651886, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.9734449386597, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.7540812492371, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.3628854751587, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.6414403915405, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.1048035621643, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.8438353538513, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.27327823638916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 500.58848381042475, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.82640409469604, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.06848764419556, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9545564651489, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.38623762130743, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.4519987106323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.9772815704345, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.33359813690186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.16911983489985, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.24559783935547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.1257562637329, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.2673554420471, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.1748833656311, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.07024002075195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.60240077972406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.18624353408813, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.2841544151306, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.4100818634033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.8857564926147, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9220790863037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.7843179702759, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.0251159667969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1465630531311, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.9468765258789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.80992221832275, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.3422431945801, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.3614430427551, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.9355239868164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.67568159103394, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.02335500717163, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.37167978286743, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.1481552124023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.0865626335144, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.0979218482971, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.99455881118774, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.10431814193726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.04416322708136, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.1798396110535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.5248045921326, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2113628387451, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.9947166442871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.09519815444946, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.31343841552734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.5028753280639, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.053918838501, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.8446383476257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.26303911209106, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.85887575149536, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.3568015098572, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.66543912887573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.5147185325623, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.9619235992432, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 489.60015535354614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.1684837341309, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0785608291626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 874.3945598602295, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.772322177887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.2844886779785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.1121644973755, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 874.3243217468262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.2979230880737, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 879.0352010726929, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.9257636070251, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.3798432350158, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.85663414001465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.2291193008422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.6958384513855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 484.5795202255249, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.3785557746887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.8054389953613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.8051190376282, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.4955177307129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.29247665405273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.2675223350525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.938720703125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.7769618034363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.69119930267334, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9878416061402, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.5391993522644, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.86832189559937, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.4361548423767, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.030080318451, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.3260769844056, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.4603247642517, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.88207721710205, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.8081603050232, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 495.141122341156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.9615979194641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.59583663940435, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.9275231361389, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.5115184783936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.274405002594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.66303873062134, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.7772827148438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.26784181594854, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.76816272735596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.67615461349493, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.55504560470575, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0500841140747, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.5083198547363, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.53760051727295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.6824059486389, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.8337631225586, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.7332787513733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.7766456604004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.809280872345, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.06032085418707, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.8980784416199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.5847959518433, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.1627125740051, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.6747250556946, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.2511959075928, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.84575748443604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.9225640296936, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.25856161117554, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.6576051712036, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4409627914429, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.0369558334351, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2860794067383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.2347197532654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.94143724441534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.21647977828974, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.85231637954706, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.7920064926147, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.40848159790045, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.9547171592712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.7339172363281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
517.1564769744873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.6196751594544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.6675143241882, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.7817568778992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.7488050460815, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7116847038269, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.1838459968567, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 663.4439992904663, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.2912030220032, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.82543420791626, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8752059936523, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 637.3879957199097, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.2328033447266, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.0777540206909, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.4710426330566, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.1694407463074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.8108806610107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1764750480652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.8641562461853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.2876825332642, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.4054408073425, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.2408022880554, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.1108884811401, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.8824014663696, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.7543940544128, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.4923195838928, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, 
"num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.731360912323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.9542369842529, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.3523230552673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.0388770103455, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.1299214363098, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.7744045257568, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.7681603431702, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.3956661224365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 841.4571237564087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.0318369865417, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.1212739944458, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 779.8830389976501, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.6016001701355, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 818.8598299026489, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 839.2447996139526, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.7948808670044, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.3683214187622, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.7462377548218, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.1515188217163, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 814.9081563949585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 842.8244781494141, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.7712001800537, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 749.1935992240906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 775.7566404342651, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.9568033218384, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 823.5291147232056, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 846.8979120254517, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.6675186157227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.6992030143738, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.5499215126038, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.30191802978516, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.3907198905945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.8447952270508, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.89888191223145, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.44207763671875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 496.51328325271606, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.1712055206299, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.742880821228, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.07775831222534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.4788784980774, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.0584034919739, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.96368169784546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.3220782279968, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.06127643585205, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.7980737686157, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4852824211121, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.5212788581848, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.2966365814209, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.9958391189575, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.95136404037476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.4340839385987, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.1915183067321, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.3401656150818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.1358428001404, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.3303999900818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.58319902420044, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 488.2697629928589, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.9320030212402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.13200187683105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.73648214340204, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.340163230896, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8992013931274, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.681282043457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.4419202804565, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.1244812011719, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.9852862358093, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.8313555717468, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.1179184913636, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.9057631492615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.0524706840515, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.4046406745911, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1921591758728, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.2057566642761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.1492824554443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.0143957138062, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.1697616577148, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 537.709755897522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.1302428245544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.1910357475281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.775363445282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.4731216430664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.3780832290649, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.4747142791748, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.5912051200867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.9903993606567, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.4051179885864, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.5265560150146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.5177640914917, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.7726430892944, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.5479989051819, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.1587152481079, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.58608627319336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.2886366844177, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.584801197052, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.1860818862915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.9603180885315, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.6439986228943, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.8236794471741, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.1798434257507, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.999837398529, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.2233567237854, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.9339208602905, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.9841628074646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.1315212249756, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.271044254303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.8460793495178, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.4313569068909, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.7401585578918, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.2086386680603, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.2854375839233, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.8731212615967, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.9388794898987, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.63968276977545, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.3755125999451, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.16239833831787, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 490.0152039527893, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.9955177307129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.3323221206665, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.8127965927124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.4606394767761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.35039854049677, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.2704014778137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.3905658721923, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.89888525009155, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.7204756736755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2705669403076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.3353614807129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 494.965922832489, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.09680032730097, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2867250442505, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.04431581497187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.80560302734375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.3481631278992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.3953580856323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 554.5215964317322, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 492.0185613632202, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0947160720826, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2012805938721, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.8695993423462, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 491.7214369773865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 
48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.368001461029, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.8206462860107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.92592048645025, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.1172814369202, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6531195640564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.2099194526672, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.4550418853759, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.7300815582275, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2699198722839, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.2220783233643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.82479381561274, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.8185601234436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.5329594612122, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.9403228759766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.7735958099365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.0487985610962, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2185597419739, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.3185596466064, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1062.3534297943115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.0342388153076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1062.4356842041016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.0468873977661, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1064.6155261993408, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.8247957229614, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1069.221749305725, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 615.9679985046387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.5446405410766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.2796778678894, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.528639793396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.5260772705078, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 487.5854444503784, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.81903886795044, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.77504158020014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.271999835968, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.0537638664246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.637282371521, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 493.4011220932007, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.0612750053406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.4062356948853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.1631984710693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.0724830627441, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.7675175666809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.7327995300293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.287043094635, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.4600005149841, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.4291191101074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.7305564880371, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.2435231208801, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 494.0336012840271, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.5094399452209, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.7393579483032, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.4918365478516, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.7483253479004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2278351783752, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.4897546768188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.170557975769, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.619366645813, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.4067230224609, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.6619181632996, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.1164793968201, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.966881275177, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.1822395324707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.7065539360046, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.0073504447937, "config": {"BLOCK_SIZE_M": 
64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.9081635475159, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.2500791549683, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.5897583961487, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.0678377151489, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.0827221870422, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.2992005348206, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.1617631912231, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.2819166183472, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.3295965194702, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 681.1264038085938, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.069926738739, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7318429946899, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 525.8723258972168, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.9729585647583, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.588321685791, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.7486357688904, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.0950417518616, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.8510394096375, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.8887991905212, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.6012873649597, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.9747171401978, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.6312017440796, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.4865670204163, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.2233624458313, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.2689609527588, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.3409585952759, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.3220806121826, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.7095928192139, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.0620818138123, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.854875087738, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.9275240898132, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 732.8964757919312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.2609572410583, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.1161527633667, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.1668815612793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.7356848716736, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.5388803482056, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 737.3942399024963, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, 
"num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.5537557601929, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 707.7444744110107, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.5902457237244, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.7891201972961, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 748.4006404876709, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 733.0875182151794, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.783362865448, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 706.7100787162781, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.2998385429382, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.5044736862183, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.2467193603516, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.5801577568054, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 961.1363124847412, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 976.3267183303833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.2817678451538, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 979.0727949142456, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 963.3452892303467, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 977.4545621871948, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 963.6254358291626, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 979.1567897796631, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.0110397338867, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.50240087509155, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.357916355133, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.6998448371887, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.6371216773987, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 512.5580787658691, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.1281642913818, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.666081905365, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.3694443702698, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.26863908767706, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.442883014679, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.0769553184509, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.617434501648, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.4894433021545, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.6171169281006, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.7303972244263, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7929615974426, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.7809543609619, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.3731241226196, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.0220808982849, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.2668871879578, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.3467154502869, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.1715245246887, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.7403240203857, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7563166618347, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.7993555068969, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.1385631561279, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.3073592185974, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.0360064506531, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.1366353034973, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.89808177948, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.785126209259, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.6876773834229, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.4948830604553, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 722.4427199363708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.443835735321, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.7939209938049, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.6385588645935, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.3308773040771, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.5303983688354, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.4428768157959, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.7195200920105, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 727.3657631874084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.9657597541809, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.6217575073242, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.1848068237305, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 727.3036789894104, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.1179213523865, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.55231761932373, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.3280005455017, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.1881637573242, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.5577549934387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.58288097381586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 498.5655951499939, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.5694375038147, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7092761993408, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 504.1460728645325, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.7044768333435, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4916839599609, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.3792014122009, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.02255582809454, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.0334372520447, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.7384028434754, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.88031673431396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.9892807006836, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.2201590538025, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.9388790130615, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.812961101532, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.6601629257203, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.0335946083069, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.3079957962036, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.905601978302, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.0860795974731, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.2968029975891, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.0737609863281, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.2716827392578, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.4595184326172, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.4486441612244, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.6582417488098, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.4278411865234, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.4052748680115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.5606355667114, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.9928016662598, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.3808054924011, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.4820823669434, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.5819201469421, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.3048024177551, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.6622395515442, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.4428758621216, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 921.077766418457, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 799.976315498352, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.2801599502563, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.2292804718018, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 899.2993545532227, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 790.6057572364807, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.8323135375977, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.9473638534546, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 916.7160034179688, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.7960052490234, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 813.1622362136841, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 938.4372854232788, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 905.6481552124023, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.6884803771973, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.5590372085571, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.9599952697754, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 921.845760345459, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 792.2380781173706, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 806.2897539138794, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 934.5491170883179, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 911.2388849258423, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 802.2751998901367, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 821.4224004745483, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 933.939037322998, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.9467124938965, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 804.8980808258057, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.1740884780884, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 932.5601577758789, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 905.476803779602, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.7479944229126, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 821.7107224464417, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 48, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1155.146722793579, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1113.1217670440674, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1161.7456102371216, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1131.181116104126, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1160.6601572036743, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1122.8836727142334, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1158.5030460357666, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1122.039680480957, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.939359664917, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.340479850769, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 594.647843837738, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.4166374206543, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.0840015411377, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.9683218002319, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.1275210380554, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.7796859741211, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.9814434051514, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.3476805686951, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.1798391342163, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.1336011886597, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.9857606887817, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.5377550125122, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.1996831893921, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.6595230102539, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.9598388671875, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
568.800802230835, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.5363230705261, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.1683177947998, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.794882774353, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.5908823013306, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.1895971298218, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.7640008926392, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 878.3763217926025, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.5832018852234, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 878.9993667602539, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.490394115448, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 871.176962852478, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 746.7972755432129, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 881.5416049957275, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.1699228286743, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.9774374961853, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 605.4067182540894, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 699.2614388465881, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.9470381736755, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 766.6521668434143, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.412965297699, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 697.3172760009766, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.633599281311, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.1723170280457, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.7616000175476, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 697.5601625442505, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 549.8230409622192, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 773.2115173339844, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 605.4091215133667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 699.9127984046936, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.9006400108337, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2502.188491821289, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.8932809829712, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2512.182397842407, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.8836770057678, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2522.870569229126, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.6814341545105, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2532.1879863739014, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
578.7032008171082, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1920.2425479888916, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1325.732479095459, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1939.4275283813477, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1349.6481609344482, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1926.4311695098877, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1330.7275199890137, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1933.9334392547607, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1356.2022304534912, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1929.1007709503174, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1340.8911895751953, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1943.335371017456, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1348.0604839324951, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1934.2028903961182, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1340.9367942810059, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1940.1208019256592, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1357.8107213974, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1880.2817630767822, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1173.8462400436401, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1879.7217464447021, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1180.1729536056519, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1884.256992340088, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1178.47008228302, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1885.5079936981201, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1177.8739166259766, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7463.56143951416, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 
1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 968.3332777023315, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7466.946144104004, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 971.491208076477, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7471.216354370117, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.8998441696167, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7484.053115844727, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 48, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 973.820629119873, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.0603189468383, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.5432019233704, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7871990203857, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.6755204200745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.4220843315125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.0828757286072, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.5691194534302, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.6315202713013, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.6953611373901, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3081579208374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.6657557487488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.8080010414124, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.2745566368103, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.2753643989563, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.5737566947937, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.0305614471436, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.3521590232849, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.1528029441833, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 515.3487992286682, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.6441602706909, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.7936010360718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.6875176429749, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.4356846809387, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.2316741943359, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.8732786178589, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.5558385848999, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.8894367218018, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.37440204620367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.27423620224, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.6806406974792, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.393274307251, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.640962600708, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.5964832305908, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1604804992676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.3163232803345, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.0208044052124, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.7168035507202, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.5582423210144, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.8864040374756, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 605.0966382026672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.8177614212036, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1848006248474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.7328038215637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.9067215919495, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, 
"num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.9183993339539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.1769638061523, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.9708814620972, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.9177570343018, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.1201586723327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.42671489715576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.002076625824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.7334394454956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.0566415786743, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.1297564506531, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.8737630844116, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.774561882019, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.2361621856689, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.37807846069336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.5304002761841, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.5033626556396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.346079826355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.6851277351379, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.3148803710938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.1604771614075, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.3432030677795, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.5134401321411, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 798.462085723877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 818.9644742012024, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.9464030265808, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 634.3315148353577, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 846.0407876968384, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 872.7017545700073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.519838809967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.0739245414734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.843514919281, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 821.9647979736328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.651035785675, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.9764790534973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.2883253097534, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 869.1459131240845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.9611196517944, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.6940813064575, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.0358452796936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 818.5174417495728, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.4014358520508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.1568007469177, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 840.9540748596191, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 864.4908666610718, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.457597732544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.672164440155, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 799.4681644439697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 812.3716855049133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 651.0638403892517, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.0976028442383, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 849.3897533416748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 863.1347179412842, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.1681609153748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.0161519050598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.26015901565546, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.41263484954834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.2406373023987, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.15791511535645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.2884788513184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.38400077819824, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.9025583267212, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.8307199478149, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.51039934158325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.6041617393494, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.0499234199524, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.53055620193476, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.7684774398804, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.0728068351745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.6985578536987, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.1443204879761, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.5921635627747, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.7665586471558, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.1644740104675, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.7950429916381, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.9515175819397, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.8393659591675, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 515.1383996009827, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.1119995117188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.6364817619323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.8902368545533, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.31135797500616, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.18607902526855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.68384075164795, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.67280292510986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.08863830566406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.1692805290222, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.8734407424927, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 579.3118405342102, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.4140782356262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.8049626350403, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.9657578468323, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.3254432678223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.73616027832037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.7931156158447, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.1039929389954, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.989116191864, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.85328197479254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4505615234375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.7296023368835, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.8580803871155, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.11791610717773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.1356792449951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.4089622497559, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.618236541748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.25583887100225, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.5779175758361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.7707147598267, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.7715172767639, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.07519769668573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.34672498703003, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.9995188713074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.6859097480774, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.08863830566406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.1902441978454, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.9132761955261, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.1409616470337, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 581.4143967628479, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.8926396369934, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.3665609359741, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.7492804527283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.8492832183838, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 703.8615989685059, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.4248023033142, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.3923192024231, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.2492814064026, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.6583905220032, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.0414423942566, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 628.7633609771729, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.3524785041809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.2523250579834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.0996813774109, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.5651183128357, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.48128175735474, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2774386405945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.0020785331726, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.1777606010437, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.571834564209, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.6465578079224, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.7819232940673, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.70928049087524, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.5284829139709, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.9577641487122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.9107189178467, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.6623873710632, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.5209593772888, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.2940773963928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.650399684906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.7259216308594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.1737585067749, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.3169584274292, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.0273613929749, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.1987223625183, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.8983969688416, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2604737281799, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.61439895629877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.5743975639343, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.98480129241943, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.410722732544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.1321592330933, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.7683200836182, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.7859172821045, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2052798271179, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.9408025741578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.12415552139277, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.6259245872498, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
550.5412817001343, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.5956816673279, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.821605682373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.3931188583374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.1377568244934, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.2900881767273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.1286382675171, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.0137605667114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.1913628578186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.2851228713989, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.5982403755188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6113533973694, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.2795233726501, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 543.7131214141846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.86400222778315, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.9575972557068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.0246353149414, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.5750379562378, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.9230403900146, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.5692782402039, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.7980813980103, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.8292784690857, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.8326387405396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 911.382246017456, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.5283236503601, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 916.3305568695068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.5051231384277, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 911.9937562942505, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.3777542114258, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 917.0539140701294, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.0640029907227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.8227243423462, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3945631980896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.1113624572754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.2366423606872, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.46063899993896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.7503967285156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.4950385093689, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.5180807113647, 
"config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.6387186050415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.33759784698486, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.8115224838257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.2659134864807, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.0312042236328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.320963382721, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.0151944160461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.80751609802246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.02127790451044, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.5423974990845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.9673624038696, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.2340755462646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.0611200332642, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.78319644927984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.26656055450434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.9806394577027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.4031987190247, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.402717590332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.9006357192993, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.5092749595642, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.2947187423706, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.6726412773132, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.2830410003662, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.7892818450928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.29776811599737, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 516.2628793716431, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.9375996589661, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.7718396186829, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.5196785926819, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.3888006210327, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2310423851013, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.896800994873, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9084792137146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.3180751800537, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.5859222412109, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.4838376045227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.6188836097717, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.336323261261, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.0769605636597, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.9233565330505, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.6889595985412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.8731160163879, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.733606338501, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.0943970680237, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.3582458496094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.047679901123, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.0582389831543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.2377610206604, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.9881567955017, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.3598389625549, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
520.9878396987915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.3865571022034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.5782384872437, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2952008247375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.5272002220154, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.2681570053101, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.0187201499939, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.5612816810608, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.3899164199829, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.1167988777161, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.3593559265137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.8574390411377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.5643219947815, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 681.8384051322937, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.9324798583984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.8036775588989, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.9542379379272, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.4033517837524, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.6160001754761, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.5244822502136, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.4601588249207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.5915231704712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.3617601394653, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2588787078857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.1147179603577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 658.4596800804138, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} 
-{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.3079977035522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.7511978149414, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.2209610939026, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.9425644874573, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.1087989807129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.9316835403442, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.9375991821289, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 663.3572793006897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.4929623603821, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.7884798049927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.6187205314636, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.3569579124451, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.7187194824219, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 841.4267301559448, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 856.6420888900757, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.2080006599426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 767.4051213264465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 793.3054423332214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.6800031661987, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 842.3431968688965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 855.0816011428833, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.937593460083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 766.4742398262024, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 795.1915144920349, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.3919949531555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
846.965274810791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 867.673282623291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.7649598121643, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 764.3028783798218, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 797.3545622825623, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.5052742958069, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 848.290228843689, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 857.3958396911621, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.0340809822083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 766.4139151573181, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 795.2583932876587, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.8270406723022, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.01232099533075, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.79759502410894, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.6967992782593, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 508.68416070938105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.111361503601, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.3140811920166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.7512021064758, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 501.4587211608886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.7281560897827, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.1086401939392, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.0422358512878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.16959285736084, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.19583940505987, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.6660771369934, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.3158383369446, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.3355212211609, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.99631547927856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.6396827697754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.3582434654236, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 510.6889629364013, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.9361605644226, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.14399766922, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.9347257614136, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9889621734619, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.6281623840332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 499.8576021194458, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.6638445854187, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.976957321167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.1071991920472, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.9208083152771, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.4289622306824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.3260803222656, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.1131181716919, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.492965221405, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 570.710563659668, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.795521736145, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.7111916542053, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.9142355918884, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.6327929496765, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.7796816825867, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.4331192970276, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.4910373687744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.3044838905334, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.5264058113098, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.513918876648, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.952793598175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.9996786117554, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.3222427368164, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.9145617485046, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.5529561042786, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.8420767784119, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.1195178031921, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 527.7169585227966, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.4046363830566, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.990243434906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.8153614997864, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.3097653388977, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.0239996910095, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.7958421707153, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.9412751197815, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.2214398384094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.3345670700073, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.67520570755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.9497632980347, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 760.5332803726196, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.3624029159546, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.7915177345276, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.4875197410583, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.448320388794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.3750386238098, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.7713632583618, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.5176048278809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.4043173789978, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.0036773681641, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.0135998725891, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.2551989555359, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.0670385360718, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 677.7351975440979, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.5424003601074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.7137637138367, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 502.2350430488587, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.312798500061, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.0489640235901, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 507.817120552063, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 500.7508826255799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.2951946258545, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.4185585975647, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.182719707489, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.45024108886713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.4958367347717, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.097442150116, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.6062393188477, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.6286401748657, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.4654388427734, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.8657531738281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.1257605552673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.3217673301696, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 522.2273635864258, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.8561539649963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.3079986572266, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 505.5998420715332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.0371170043945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.5483183860779, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 565.5024003982544, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.47920083999634, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.4395189285278, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.3895998001099, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6382393836975, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.0168023109436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.5456047058105, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.6683201789856, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.9942359924316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.9411172866821, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.6548810005188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.1296005249023, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.1164746284485, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} 
-{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.661600112915, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.5753617286682, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 536.6084814071655, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.3305597305298, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.0164804458618, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.7988777160645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.0115189552307, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.0311970710754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.8952045440674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.0660743713379, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.7788844108582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1071.846718788147, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.9227204322815, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1073.27054977417, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.7475185394287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1077.9750394821167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.9838395118713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1082.78799533844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.2513628005981, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.9088034629822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.5451216697693, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.7721562385559, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.4798412322998, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 497.4169611930847, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 504.4590377807617, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 517.9686427116394, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.2668771743774, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.9203171730042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 537.9015970230103, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 503.9767956733704, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.41599893569946, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 520.9601616859436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.8236804008484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.4391965866089, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.3846373558044, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.4899158477784, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.2964816093445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.1459202766418, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.5657544136047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.7374496459961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.9068813323975, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 506.1609625816346, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.8931188583374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.3432011604309, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.2140784263611, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.2542414665222, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 527.3694443702698, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.9828805923462, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.747838973999, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.6524796485901, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 528.8112020492554, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 
64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.0780849456787, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.8292856216431, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.2932806015015, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.5326428413391, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.5865559577942, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.7563209533691, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.31951379776, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.3539175987244, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.2407975196838, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 529.070234298706, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.3153643608093, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.4620747566223, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.9492840766907, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.7636799812317, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.2308721542358, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.5460758209229, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.0940809249878, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.8001565933228, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.2923202514648, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.1092820167542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 605.316162109375, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.8847994804382, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.1929640769958, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.6766428947449, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.2127981185913, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 530.9396743774414, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.9163198471069, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.7244849205017, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.1000027656555, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.6432046890259, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.3168048858643, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 697.8276777267456, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.5575909614563, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 736.1916756629944, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.3156781196594, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.1681618690491, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 760.9014439582825, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.5169653892517, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.6296038627625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 728.6849594116211, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.6320023536682, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.751838684082, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.7249598503113, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.1246409416199, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.2083191871643, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 733.6545634269714, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 694.7238349914551, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.9833588600159, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 766.46000623703, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.9748768806458, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.6273593902588, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, 
"num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 736.2687969207764, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.9307174682617, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.6622433662415, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 771.6876792907715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.9667205810547, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 985.7188749313354, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1001.572003364563, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 986.2617588043213, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1001.8129587173461, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 983.9632034301758, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1008.9296007156372, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 982.8913688659668, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1008.8852834701538, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.2398409843445, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 513.7081623077393, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.8484811782837, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.5873599052429, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.8672013282776, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.153920173645, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.7104015350342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.2427201271057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.4833607673645, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.2651166915894, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.2048015594482, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.9199995994568, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 564.8700737953186, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.6121587753296, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.2052764892578, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.724956035614, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.2187190055847, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.2000026702881, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.2348818778992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.7931289672852, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.1852769851685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.7321586608887, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.602560043335, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.2657580375671, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.541437625885, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.257764339447, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.159679889679, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.5339164733887, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.5971202850342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 526.7740797996521, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.2430362701416, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.6649675369263, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.5172824859619, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.4684772491455, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 742.6668810844421, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.143358707428, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.9372806549072, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.7983999252319, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 
4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.4020833969116, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.2817621231079, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.4838371276855, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.606876373291, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 744.8448038101196, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.4238381385803, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.5772738456726, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.9470405578613, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.1017565727234, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.5865573883057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.616641998291, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.2857613563538, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 514.0148758888245, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.3879985809326, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 521.8740773200989, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.887363910675, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.52383756637573, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 530.3430366516113, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 517.1487998962402, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 509.8084759712219, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 523.0822372436523, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.7280020713806, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.91200256347656, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.7006411552429, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 519.2462372779846, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 512.051203250885, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 524.9691200256348, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 515.6312012672424, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 511.7724823951721, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 533.3990454673767, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 518.6767983436584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.2070407867432, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 525.5502390861511, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 512.9102444648743, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.374400138855, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.2923183441162, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.8424010276794, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.711042881012, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.529757976532, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.2929558753967, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 605.2484798431396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.4345560073853, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.8401598930359, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 531.5531277656555, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.2745552062988, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 532.687361240387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.0030460357666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 534.8001623153687, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 602.0979166030884, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 535.6596803665161, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 938.8708782196045, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, 
"num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 933.3428907394409, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 806.6062498092651, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.8475284576416, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.0080099105835, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 920.7534551620483, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 810.7260751724243, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 826.9105577468872, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.6179246902466, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 944.681601524353, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 816.4420700073242, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 833.6480093002319, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.1675186157227, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 926.2828874588013, "config": {"BLOCK_SIZE_M": 
128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 812.500467300415, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 829.5297622680664, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 943.0897617340088, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 946.083517074585, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 818.0067253112793, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 836.5208101272583, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 940.1592016220093, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 928.2118368148804, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.1395235061646, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 823.1782293319702, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 941.5383958816528, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 949.1524744033813, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 812.5603246688843, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.959997177124, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.2051267623901, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 925.173749923706, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 809.9843168258667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 828.2486486434937, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1178.4764766693115, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1130.32320022583, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1186.4342308044434, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1136.905426979065, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1182.4462461471558, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1138.4574460983276, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1184.6959972381592, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1137.0959901809692, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.3907208442688, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.6041626930237, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.4972853660583, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.6107177734375, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.5347218513489, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.3566408157349, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 686.3699221611023, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.3311982154846, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.8641610145569, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.5596785545349, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.007040977478, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.1008014678955, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.029914855957, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 653.5635209083557, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.5428824424744, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.0348806381226, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 631.082558631897, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.2852754592896, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 686.1870408058167, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.2113585472107, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.5435152053833, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 668.8268756866455, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.6777606010437, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.5371189117432, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 883.8755130767822, 
"config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 764.1984009742737, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 895.3737640380859, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 767.0913600921631, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 887.8481578826904, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.7481570243835, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 894.3195199966431, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 765.0977635383606, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.8739252090454, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.1700768470764, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 707.6553606987, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.7859153747559, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 780.9352016448975, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.0686411857605, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.4377593994141, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.7073659896851, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 783.101761341095, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.9715209007263, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.5652809143066, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.4166445732117, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 790.0716853141785, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.2886385917664, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 712.1583938598633, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.3241620063782, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2521.7492961883545, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.7732825279236, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, 
"num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2530.7081508636475, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.0651245117188, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2544.6868801116943, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.168155670166, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2553.4073638916016, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.295684337616, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1943.988962173462, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1345.328164100647, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1949.3907451629639, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1361.799349784851, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1949.2251110076904, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1346.7343950271606, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1955.9214305877686, "config": {"BLOCK_SIZE_M": 
256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1368.5054445266724, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1952.2484683990479, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1354.713110923767, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1957.517900466919, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1372.2995233535767, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1952.1230792999268, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1352.1305513381958, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1958.2923316955566, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1372.8006410598755, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1865.508975982666, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1185.5326461791992, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1865.8688163757324, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 1190.4919862747192, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1867.5232028961182, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1188.845591545105, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1871.9436836242676, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1190.1843166351318, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7485.676460266113, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 976.5776109695435, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7486.932640075684, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 971.8083143234253, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7492.872543334961, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 970.3041553497314, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7513.126907348633, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 96, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 977.3367929458618, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 
256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 705.0385522842407, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.8475217819214, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 726.2756776809692, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.4318375587463, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1097.4406433105469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.076476097107, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1042.0361518859863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1065.3776025772095, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.7980847358704, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.800323009491, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.2513580322266, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.0417566299438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1086.9603252410889, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1050.87824344635, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1068.0244779586792, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1090.270881652832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.4615964889526, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.2632012367249, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 716.8827199935913, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.6560020446777, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1094.0374422073364, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1058.231987953186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1060.8553504943848, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1104.711675643921, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 689.6995186805725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
694.4683265686035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.254876613617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.7734365463257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1090.0416040420532, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1057.1808004379272, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1068.3619260787964, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1093.8209676742554, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 732.2345566749573, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 759.2289614677429, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 838.7801551818848, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 971.5772771835327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1048.3521604537964, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1026.6931247711182, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1091.0323190689087, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1234.908151626587, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.1403198242188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.4292802810669, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 837.9812812805176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 974.6336030960083, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1048.88032913208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.3462371826172, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1093.2025575637817, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1229.9054336547852, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 721.9833564758301, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 757.1385598182678, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 835.3201770782471, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 973.6555194854736, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1046.0542345046997, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1024.1334390640259, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1088.440146446228, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1228.6219310760498, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.4796800613403, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 752.0143985748291, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 836.3923168182373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 965.1556777954102, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1049.1747093200684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1026.3545608520508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1084.7732782363892, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1237.6918363571167, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 992.0294380187988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1112.4440050125122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1639.2615985870361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1693.176622390747, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1284.312801361084, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1281.3150358200073, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1837.6454257965088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1877.070083618164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1000.7039928436279, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1105.8030366897583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1652.9142379760742, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1679.1923236846924, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1287.8228902816772, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1284.1814374923706, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1829.1524696350098, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1878.6201763153076, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1002.0670366287233, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1107.559208869934, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1646.8219089508057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1691.1127853393555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1302.864327430725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1271.6780757904053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1829.8835182189941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1877.126865386963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1010.4499101638793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1111.6452836990356, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1626.6140604019165, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1701.1876964569092, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1311.0716772079468, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1289.9491214752197, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1828.4331321716309, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1878.756332397461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.7854385375977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.6414394378662, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.7289657592773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 723.1070423126221, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 698.9587235450745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
667.6572847366333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.9707188606262, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 726.4900779724121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.7484803199768, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 646.5270400047302, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.261438369751, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.9302401542664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.4255986213684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 668.6576008796692, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 683.5809588432312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.5174403190613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.8935985565186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.2814407348633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.9567999839783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 722.4324798583984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.4417643547058, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.6425614356995, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 683.450882434845, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.0975980758667, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.4260764122009, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 644.4814395904541, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 664.8704028129578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.3076801300049, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.0713586807251, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.9750428199768, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.8915209770203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 
4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 733.9763164520264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.6404790878296, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.9841585159302, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 966.28737449646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 973.398232460022, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 714.9844861030579, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.8084750175476, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 988.4087991714478, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 989.3472099304199, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.8305583000183, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.2603168487549, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.8867177963257, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 969.8819208145142, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 
64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.8454384803772, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 750.9564805030823, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 962.2441625595093, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1000.7641649246215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.2964754104614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.5947179794312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 935.3392028808594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 962.4923229217529, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 718.6081647872925, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 748.2320046424866, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 962.2454452514648, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1006.4216041564941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 673.0060791969299, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 743.2631993293762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 923.5367965698242, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 965.030722618103, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.9232001304626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.3897614479065, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 961.1070442199707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 990.4635190963745, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 976.5908813476562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1307.876968383789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 854.9668741226196, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1143.5702323913574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.4764785766602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1294.7480010986328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 858.624005317688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1150.023045539856, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.3345623016357, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1303.1838464736938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 857.4694442749023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1149.169921875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 959.4745588302612, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1308.8129568099976, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 848.7980842590332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1142.75887966156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.397759437561, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.963041305542, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.9703960418701, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 751.7083191871643, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.1889634132385, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.5752000808716, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 656.2022399902344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 728.457441329956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.4924812316895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.1862373352051, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.4017615318298, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.4884757995605, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.5337543487549, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.0561580657959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 667.9014372825623, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.5244817733765, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.3443269729614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.3439984321594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.4753613471985, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.5897603034973, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.072639465332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.4684782028198, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 669.6260833740234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 741.2465620040894, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 660.7089614868164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.0382509231567, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.1743969917297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 737.7763223648071, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.905125617981, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.6401629447937, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.8627166748047, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 731.5620803833008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.9873633384705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 861.725435256958, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 869.6388721466064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 696.3446378707886, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 786.9830417633057, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 794.2708778381348, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.727518081665, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 837.8744029998779, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 824.7273540496826, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.3700828552246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 771.9555163383484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 768.9867234230042, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.4281620979309, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 836.9027185440063, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 827.0968055725098, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 698.0323195457458, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 771.158881187439, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.3134398460388, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.259684085846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 827.6155185699463, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 819.9415969848633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 694.2118453979492, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 769.3417620658875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 768.8142418861389, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1696.726245880127, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1095.3347253799438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1690.3497505187988, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1012.583203315735, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1687.821445465088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1016.8332862854004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1680.4811096191406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1016.026406288147, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.8689570426941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 718.7489604949951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 728.7094402313232, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.6454372406006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.6563220024109, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.649760723114, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.165602684021, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.4902367591858, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 706.8849611282349, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 676.0153603553772, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.7264060974121, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.930558681488, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 694.1463971138, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.0579223632812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.0336036682129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 678.7243223190308, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 679.3604731559753, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.7030358314514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.768479347229, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.6012787818909, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.1963195800781, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.5147228240967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.9017601013184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.2670373916626, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 908.1606340408325, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 856.5286493301392, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
879.2572736740112, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.7097587585449, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 881.1470413208008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 816.742889881134, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 877.0390462875366, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 809.639356136322, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.5998406410217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.9131183624268, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.7036828994751, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.5089664459229, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 802.1657562255859, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 722.7801632881165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 734.0424060821533, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 761.6350388526917, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.8772802352905, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.1888031959534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.6971211433411, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.5595231056213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 783.0963206291199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 712.9271984100342, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 732.0092821121216, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.9528021812439, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.4572839736938, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 592.0369625091553, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.7460803985596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 657.8643155097961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.1459245681763, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 717.2678399085999, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 731.0860800743103, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 758.3113598823547, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.0057597160339, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.3947229385376, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.6780800819397, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 654.7806429862976, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 782.7446389198303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 709.9985599517822, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.7785639762878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 762.8047943115234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.6348867416382, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.4726419448853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 736.4572787284851, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1042.372169494629, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 782.5702381134033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 728.8771224021912, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 830.9343957901001, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1109.3494319915771, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.0964789390564, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 659.7124767303467, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 735.6393599510193, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1027.21200466156, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 782.3151993751526, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 735.1779174804688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.4380836486816, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1104.9745559692383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.8883213996887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 651.8678402900696, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 740.7575988769531, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1022.3705530166627, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 788.6702418327332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 733.3361625671387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 802.0587277412415, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1118.440637588501, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.4745554924011, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 651.5785574913025, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 725.1488018035889, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1038.227367401123, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 781.5470385551453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 729.268159866333, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 806.988480091095, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1101.4806365966797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 950.0555229187012, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1310.5729627609253, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1369.9715328216553, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 958.7740802764893, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1227.5124883651733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1264.5772743225098, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 954.484486579895, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1321.5358352661133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1373.0083227157593, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 952.434720993042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.0912103652954, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1272.1817636489868, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 940.1297616958618, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1315.9067153930664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1377.9009437561035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 954.7270393371582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1204.5113611221313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1271.9702434539795, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 931.7540836334229, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 1308.7649631500244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1371.9257545471191, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 949.5920038223267, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1208.7875175476074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1279.6580696105957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.9225625991821, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.5030355453491, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.1164793968201, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 606.9249606132507, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.0934371948242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.1435217857361, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.5502362251282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.4315180778503, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 
256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 585.2571177482605, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.6041569709778, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.6470394134521, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.0974354743958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.4431972503662, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.4862418174744, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 587.6593637466431, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.4078392982483, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.9287972450256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.9838366508484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.5484828948975, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 600.9659194946289, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.932954788208, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.4839997291565, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.2364768981934, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.5532846450806, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.3433589935303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.0244832038879, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 567.6204776763916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.126874923706, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.3841619491577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.6316757202148, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.3606395721436, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.306236743927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.8017630577087, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
630.1667213439941, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 775.5905604362488, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 777.4523162841797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.2667217254639, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.483362197876, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.90895652771, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 783.9528012275696, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 647.753598690033, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.25727891922, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 763.4220838546753, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 788.037919998169, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 692.8118419647217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.4139165878296, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 761.2171149253845, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 785.7353639602661, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 648.7551975250244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.2155237197876, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.6702446937561, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 778.327362537384, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 688.3673620223999, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.6428823471069, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.5719995498657, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.8832001686096, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.5694408416748, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.6315197944641, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 758.9847946166992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, 
"GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.5702438354492, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 685.7436776161194, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.151674747467, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.9441566467285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.57967710495, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.0300731658936, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1167.2147226333618, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1013.7425661087036, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 902.6828861236572, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.4001607894897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1180.5481576919556, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1018.2075262069703, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
901.6348791122437, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.3236880302429, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.938554763794, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1022.9980802536012, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 896.7939186096191, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 809.9787211418152, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1171.892008781433, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1028.0190324783325, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 893.1148767471313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.4886426925659, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.6440000534058, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.2859253883362, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.1676826477051, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.7723145484924, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.1627197265625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 603.645441532135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.4489598274231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.156955242157, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.5331196784973, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.1622338294983, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.8876781463623, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.1713590621948, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.9614410400391, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.0238375663757, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.7977585792542, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.2014403343201, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.9959983825684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 618.0420851707458, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.8334455490112, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.3577599525452, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 560.0859212875366, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.3745636940002, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.1040000915527, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.1937599182129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.3727989196777, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.7201552391052, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 620.9391975402832, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.7889609336853, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
560.3163194656372, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 608.1294393539429, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.8649578094482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.2844815254211, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.6995224952698, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.5838370323181, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 652.9513621330261, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.3766403198242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.0257635116577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 676.814079284668, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.0062355995178, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.0303955078125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 685.895037651062, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 673.6356806755066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.6977553367615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.1334414482117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 682.0091199874878, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 672.6371192932129, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.4497594833374, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1468.002233505249, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 825.894889831543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1499.080638885498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 821.6987228393555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1503.625283241272, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.1233558654785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1509.8982429504395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 
256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 816.5620756149292, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.7532801628113, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 607.4809622764587, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 627.292640209198, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 670.4980754852295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 557.4260830879211, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.1028738021851, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.7606468200684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.2907228469849, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.5968012809753, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.5480027198792, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.0720009803772, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
579.5219230651855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.4859156608582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.3099131584167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.9305653572083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 693.4486413002014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.1262426376343, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.6668844223022, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.0465569496155, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.9127988815308, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.4727964401245, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 695.6990361213684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 576.1230397224426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 575.3364825248718, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 923.6857509613037, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.2328023910522, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 919.8958349227905, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.1665639877319, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 926.2558460235596, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.5401582717896, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 928.5524845123291, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 639.236319065094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 609.6393632888794, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.5728001594543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.1913542747498, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.3193593025208, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.6684856414795, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.3065605163574, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.0428791046143, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 724.2015957832336, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.1747217178345, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.2009625434875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.031834602356, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 700.4167985916138, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 622.1892762184143, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.6353578567505, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.8255987167358, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 733.3033609390259, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 610.1230430603027, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 561.8726420402527, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.6031999588013, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 702.0008015632629, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 625.5527997016907, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.4808020591736, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 599.4268751144409, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 737.0497584342957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.675678730011, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.5540781021118, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.8072023391724, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 704.7966361045837, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.5872020721436, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.8929567337036, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 601.705436706543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 739.0990471839905, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 671.7598390579224, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 798.2548713684082, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 747.3843216896057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.6667141914368, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 832.005124092102, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 796.3936042785645, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 674.6142435073853, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 799.5707249641418, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 755.0219202041626, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 772.6415944099426, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 826.2671899795532, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 
3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 807.5595235824585, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 675.5057621002197, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 798.8775968551636, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 753.7739253044128, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 770.4758358001709, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 824.1686344146729, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 813.2708883285522, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 680.5820846557617, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 803.3567953109741, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 760.6950426101685, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 776.9761538505554, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 831.2022352218628, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 824.2894315719604, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1034.8675298690796, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1076.834397315979, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1033.165111541748, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1080.110559463501, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1036.677598953247, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1071.342568397522, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1042.8159952163696, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1076.5801572799683, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 569.6777558326721, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.9086394309998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.0649647712708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.6707172393799, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 587.0371150970459, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.1881642341614, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.5228805541992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.407518863678, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 571.1622333526611, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 544.0038371086121, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 580.4260754585266, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 574.2479968070984, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 586.6116786003113, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.41952085495, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 596.3054394721985, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 590.4022407531738, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.2796859741211, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 547.0583939552307, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 582.5739192962646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 573.6763167381287, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.7742366790771, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.5411267280579, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 597.3063945770264, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 591.3121604919434, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 578.4526419639587, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.7542443275452, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.4134411811829, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 577.8220868110657, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 595.557918548584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 558.8028812408447, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 598.9056015014648, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 593.7588810920715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 715.8371257781982, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.4841628074646, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 796.2043261528015, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.6214365959167, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 711.9256019592285, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.8385629653931, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 795.064799785614, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 630.0252842903137, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.1275177001953, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 638.3449602127075, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 800.9657621383667, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 634.6259236335754, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 719.9332809448242, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 640.4929566383362, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 805.6567907333374, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 636.897759437561, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 541.1820769309998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 563.79807472229, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.8779211044312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 538.330237865448, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.0456070899963, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.7457642555237, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 540.7774424552917, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 559.6425533294678, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 551.3000011444092, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 539.3555235862732, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 552.4048018455505, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.995677947998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.1623992919922, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 562.880482673645, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 553.5331153869629, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 542.3972868919373, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 555.8963179588318, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 548.6204743385315, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 546.6216015815735, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 566.6313576698303, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 556.3305568695068, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 545.3137636184692, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 559.923357963562, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 550.0307250022888, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.0262379646301, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 621.4380788803101, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 628.4899139404297, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 616.6489672660828, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 629.3430376052856, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 619.3649625778198, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.8425641059875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 623.3886432647705, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 624.3171191215515, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 564.084005355835, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 626.9067215919495, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 565.9126448631287, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 632.4215984344482, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 568.4398412704468, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 635.7977557182312, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 572.0243263244629, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 954.6132850646973, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 983.7791872024536, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 850.0809669494629, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.9438381195068, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 947.2068691253662, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 960.7796764373779, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 835.5209541320801, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 843.000955581665, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 954.2955207824707, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 989.6404790878296, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 853.1310415267944, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 853.5799932479858, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 946.4985513687134, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 964.4591951370239, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 831.9080018997192, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 841.8902492523193, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 960.1108884811401, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 996.4556884765625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 866.5036773681641, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 859.8441696166992, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 948.9329624176025, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 969.2694330215454, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 839.833607673645, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 844.882402420044, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 964.3443155288696, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 998.8769674301147, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 862.4568033218384, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 866.3609409332275, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.7219305038452, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 973.5979223251343, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 850.3876829147339, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 843.493595123291, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1247.4396705627441, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1196.6036748886108, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1253.0902481079102, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1203.191204071045, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1259.3569564819336, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.327359199524, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1262.1897602081299, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1210.5126333236694, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 706.1572790145874, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 689.6748733520508, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 633.9174389839172, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 686.6585659980774, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 662.3212814331055, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 604.4791984558105, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 713.0124807357788, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.5126419067383, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 641.4780807495117, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 687.6030468940735, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 665.6009602546692, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 611.299684047699, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 708.5684823989868, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 694.6448016166687, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 643.4639978408813, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 691.6072058677673, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 666.9710445404053, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 614.3824005126953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 710.6932830810547, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.6959962844849, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 642.5201606750488, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 690.6991958618164, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 667.1486377716064, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.62624168396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 924.7968101501465, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.4439988136292, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 929.9596834182739, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 800.6940770149231, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 936.9561529159546, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 803.1054401397705, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 928.0075168609619, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 801.3619208335876, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 811.538405418396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 649.9500823020935, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 735.8307242393494, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 583.3406448364258, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 812.0356750488281, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 655.1200032234192, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.8224005699158, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 584.6519947052002, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 819.1145706176758, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 654.0332770347595, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 738.5864043235779, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 589.0294361114502, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 822.8027200698853, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 661.3356828689575, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 745.9905648231506, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 588.3902406692505, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2633.9260864257812, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 612.5137639045715, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2653.2561588287354, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 615.3406429290771, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2671.504011154175, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 613.7409543991089, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2676.8460655212402, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 617.1438336372375, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1986.9647979736328, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1382.1910429000854, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1993.8891220092773, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1394.860486984253, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1995.9868907928467, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1387.7804803848267, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2005.5671977996826, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1404.0884685516357, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1998.2713508605957, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1390.7809686660767, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2006.6262435913086, "config": {"BLOCK_SIZE_M": 256, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1403.6855936050415, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2000.3953552246094, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1385.6865692138672, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2011.1102676391602, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1401.5947246551514, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1879.7185611724854, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1219.4129657745361, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1878.4107398986816, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1224.373435974121, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1885.1270198822021, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1227.7771139144897, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1892.499189376831, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 1234.9451208114624, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7550.028991699219, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 995.6875085830688, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7541.615829467773, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 998.9518451690674, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7552.024002075195, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1006.8326473236084, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7590.907745361328, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 256, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1004.2790222167968, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2325.2532863616943, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2296.137933731079, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2365.2686405181885, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2490.0865650177, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} 
-{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3382.0705795288086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3328.2355308532715, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3364.321632385254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3459.93408203125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2300.016326904297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2301.934242248535, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2368.136806488037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2471.312484741211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3389.721736907959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3397.2830390930176, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3444.3581008911133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3542.7696228027344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2287.9634952545166, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2306.846227645874, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2372.6742362976074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2473.692150115967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3400.797920227051, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3404.9420738220215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3436.693572998047, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3542.7255821228027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2277.599334716797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2302.4955081939697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2368.753261566162, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2461.8998622894287, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3392.7294158935547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 3403.560676574707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3439.5680046081543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3539.375057220459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2369.559679031372, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2488.936014175415, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2749.2487812042236, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3197.5519943237305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3345.940628051758, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3309.0960121154785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3521.715679168701, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4038.7622451782227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2348.0203247070312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2466.6383838653564, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2696.7115211486816, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3143.889112472534, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3351.6892623901367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3307.6143836975098, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3513.3383560180664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4006.330547332763, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2350.494394302368, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2457.4228858947754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2722.036647796631, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3140.5150413513184, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3346.654853820801, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3311.755790710449, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3511.783981323242, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4017.422027587891, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2339.5408153533936, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2447.5622177124023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2712.127857208252, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3127.0948791503906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3350.947322845459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3311.8330001831055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3508.85347366333, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4016.8814468383794, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3278.179054260254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3665.1455879211426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5271.618709564209, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5403.523712158203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4104.797115325928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4187.295684814453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6038.239364624023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6186.793899536133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3291.3073348999023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3584.54158782959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5321.689910888672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5469.034729003906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4076.164455413818, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4172.8862380981445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6044.528961181641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6216.968612670898, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3341.503086090088, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3582.666721343994, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5337.356605529785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5500.024948120117, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4104.877471923828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4180.167636871338, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6074.777431488037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6231.737880706787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3335.456771850586, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3566.8740463256836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5343.808937072754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5500.974235534668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 4102.738361358643, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4181.780014038086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6076.649761199951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6225.860290527344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1962.5072002410889, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.7179107666016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2076.2822341918945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2298.2894229888916, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2183.277425765991, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2096.8153762817383, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2161.693925857544, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2338.120641708374, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1856.0558605194092, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1898.8311862945557, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1980.9948635101318, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2200.7787227630615, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2191.509437561035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2007.4171161651614, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2067.5657653808594, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2255.6475162506104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1842.3112106323242, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1887.1766376495361, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1982.7358436584473, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2195.7697582244873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2180.8512210845947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1979.1953563690186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 
64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2053.30096244812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2263.948497772217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1829.9763298034668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1877.648983001709, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1973.911533355713, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2179.156322479248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2159.1721534729004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1967.6132678985596, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2047.0966434478762, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2262.039031982422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2076.0449600219727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2408.923215866089, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 3096.556329727173, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3132.0811080932617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2250.733766555786, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2355.063190460205, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3062.4307250976562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3213.1310176849365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2000.9524917602537, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2306.1835193634033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2765.110397338867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2922.087516784668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2225.341272354126, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2221.8268871307373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3034.3875122070312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3171.729145050049, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2001.184320449829, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2300.3471851348877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2771.8313598632812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2931.5155124664307, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2233.65008354187, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2222.94864654541, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3042.3180961608887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3191.5511798858643, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1989.2753219604492, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2291.641607284546, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2773.565902709961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2929.502239227295, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2218.038396835327, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2214.0537548065186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3035.0564861297607, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3182.324962615967, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3193.9623737335205, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4151.05598449707, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2707.0990562438965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3603.6607933044434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3053.8827228546143, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4110.69278717041, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2620.7814407348633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3602.5545501708984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3066.490068435669, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4123.734569549561, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2618.026714324951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3613.1824111938477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3052.3884677886963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4118.141632080078, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2604.8092937469482, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3612.070083618164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1822.4259090423584, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1874.323844909668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2015.3076648712158, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2319.6843242645264, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1903.288974761963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1827.4851322174072, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1902.9497337341309, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2261.23646736145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1751.5752220153809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1790.8558368682861, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1863.1598567962646, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2037.8790569305418, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1859.9424266815186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1760.6377506256104, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1853.454704284668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2193.4414291381836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1737.6414394378662, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 1779.098720550537, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1850.3230381011963, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2037.8512001037595, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1853.8227272033691, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1748.0233573913574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1850.3932857513428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2187.209596633911, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1732.4563312530518, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1776.4003276824951, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1844.5606327056885, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2046.5270519256592, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1846.9128227233887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1739.2776107788086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1839.2604732513428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2190.531349182129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1924.9524688720703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2712.617120742798, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2740.6832122802734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2072.7961921691895, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2395.9286403656006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2441.562900543213, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1862.3895931243896, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2307.3287963867188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2353.3585262298584, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1972.4908828735352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
2182.807836532593, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2259.4164752960205, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1856.1369514465332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2304.610252380371, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2343.8676834106445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1969.2479801177979, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2197.0961380004883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2255.193281173706, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1850.4859161376953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2301.2273597717285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2338.0764961242676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1963.9484786987305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2194.8512077331543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} 
-{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2248.5825538635254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5152.507476806641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3510.769805908203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5001.980152130127, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3111.588478088379, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5021.744918823242, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3100.3017807006836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5033.472805023193, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3095.7464027404785, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1899.7974395751953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1989.0390491485596, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2155.0366401672363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1821.1745738983154, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1757.2502517700195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1873.7697505950928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1796.9629001617432, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1787.0764827728271, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1792.626085281372, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1696.8927955627441, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1660.7660675048828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1697.2890949249268, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1783.3083248138428, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1789.6009731292725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1797.2740745544434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1672.0020771026611, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1654.3060684204102, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1677.418556213379, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1793.548812866211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1792.2803211212158, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1782.6718521118164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1662.3835182189941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1659.9935913085938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1679.461441040039, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2749.5820713043213, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2582.4508666992188, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2474.4275283813477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2281.0542488098145, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2492.0643424987793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2294.744634628296, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2490.7241439819336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2291.5478515625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1648.95601272583, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1632.6124954223633, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1698.1339168548584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1813.8212776184082, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2105.393114089966, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1983.3755207061768, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2031.1073875427246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2109.0611267089844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1683.9328002929688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 1686.5449619293213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1752.660322189331, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1875.2216148376465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2051.291847229004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1966.0094547271729, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2028.983039855957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2110.2267169952393, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1700.674877166748, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1689.3655967712402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1758.1743907928467, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1870.62593460083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2049.725294113159, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1975.6360054016113, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2030.1319885253909, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2113.5256004333496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1703.7476921081543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1687.9036903381348, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1756.5970993041992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1874.8363208770752, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2054.7152042388916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1972.1846294403076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2017.085762023926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2112.240810394287, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1884.852647781372, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1812.8759860992432, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2065.0228881835938, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2825.2681636810303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2098.7470531463623, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1988.223533630371, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2228.924627304077, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3039.3081378936768, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1938.3608150482178, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1848.5539150238037, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2081.45263671875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2842.858896255493, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2141.501922607422, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2002.2487735748289, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2205.667200088501, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 3079.0238189697266, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1939.9072074890137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1843.797435760498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2082.3193645477295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2848.2295989990234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2139.2094230651855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1998.5345554351807, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2220.647678375244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3078.7990283966064, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1933.6048030853271, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1843.5536003112793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2075.6279945373535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2857.696475982666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 
5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2131.0843181610107, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1994.0660572052002, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2208.0428886413574, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3088.5987091064453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2494.1913509368896, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3575.256824493408, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3731.499786376953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2524.120969772339, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3368.1519889831543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3493.7064170837402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2544.136486053467, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3594.0499687194824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3755.416603088379, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2597.4507331848145, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3355.1088333129883, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3512.6542472839355, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2541.4427375793457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3603.0031967163086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3766.8772888183594, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2598.8531494140625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3362.179698944092, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3514.2855644226074, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2544.0563106536865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3607.271041870117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3767.915325164795, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2596.689920425415, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3368.184814453125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3514.6094703674316, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1466.7193603515625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1315.921926498413, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1363.7137603759766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1489.0710401535034, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1563.353443145752, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1410.4839992523193, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1455.1815938949585, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1589.663519859314, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1472.6380681991577, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1327.1510362625122, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1373.6737632751465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1484.6588850021362, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1563.9300870895386, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1420.680809020996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1462.9547309875488, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1590.7044792175293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1474.281120300293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1327.7479982376099, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1372.403998374939, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1484.756326675415, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1556.3782405853271, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1420.3697633743286, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1459.2766427993774, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1596.321930885315, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1464.4580745697021, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1319.2007970809937, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1369.5771169662476, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1478.9636850357056, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1551.454553604126, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1415.4937601089478, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1457.6584100723267, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1592.440013885498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1782.2732830047607, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1570.7563304901123, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2008.481435775757, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2061.759834289551, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1740.7611274719238, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1565.6084775924683, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2070.2472019195557, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2120.11905670166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1801.8465518951416, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1579.83247756958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1994.480333328247, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2071.0822200775146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1755.3910636901855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1549.2913484573364, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2041.7860889434817, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2127.9401779174805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 
64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1801.5620708465576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1576.0849714279175, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.2606525421143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2084.2167949676514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1752.4904155731201, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1547.5499153137207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2045.433759689331, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2117.548942565918, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1792.079210281372, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1572.1385622024536, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1981.6809558868408, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2078.2939434051514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 1754.0628719329834, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1538.94784450531, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2046.2993526458743, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2124.567337036133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2197.849750518799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3193.8487911224365, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2662.544479370117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2404.2737674713135, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2154.6292972564697, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3192.3340702056885, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2675.840139389038, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2419.6913719177246, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2160.6534671783447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, 
"num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3193.9822578430176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2671.11008644104, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2421.355199813843, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2164.7603130340576, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3193.477268218994, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2682.7462482452393, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2421.277904510498, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1262.282075881958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1201.5086364746094, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1422.7737617492676, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1465.2592086791992, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1411.0251140594482, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1211.991844177246, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1426.995348930359, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1457.250075340271, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1260.540795326233, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1204.4737720489502, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1410.3353548049927, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1414.775996208191, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1445.5942392349243, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1215.5359888076782, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1445.9395265579224, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1455.5521726608276, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1259.032006263733, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1202.6862335205078, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1409.5865678787231, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1415.9948873519897, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1458.6974430084229, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1213.9878416061401, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1438.3478355407715, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1459.9865627288818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1255.8263969421387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1198.6267185211182, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1411.665916442871, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1416.563835144043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1455.6579303741455, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.1926383972168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1438.5614347457886, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1456.0063982009888, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1599.780478477478, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1674.1366481781006, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1665.4646492004395, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1638.871350288391, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1568.3475160598755, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1652.6535987854004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1678.5615921020508, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1625.1198530197144, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1568.5704040527344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1654.5060920715332, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1675.6305503845215, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 1630.347981452942, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1558.626732826233, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1653.868808746338, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1675.4491329193115, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1637.54798412323, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3723.2805252075195, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2257.303991317749, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3780.8773231506348, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2106.2124824523926, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3792.2976875305176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2102.2619342803955, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3819.5900917053223, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2096.620168685913, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1272.847843170166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1321.9567966461182, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1327.2099304199219, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1636.2148666381836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1142.5174236297607, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1166.6289710998535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1239.5072078704834, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1303.6526489257812, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1302.2471952438354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1705.6990337371826, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1180.5571269989014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1189.1742515563965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1238.2996797561646, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1306.4175939559937, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1309.250078201294, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1708.7465476989746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1183.8438367843628, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1185.7729530334473, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1227.967824935913, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1300.3692817687988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1311.3686323165894, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1683.2121562957764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1183.9692735671997, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1182.3622417449951, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2217.439832687378, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1681.178903579712, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2210.387706756592, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1578.6976099014282, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2194.725122451782, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1576.905426979065, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2222.7692699432373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1566.0979223251343, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1391.3511991500854, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1326.030879020691, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1394.6120071411133, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1702.0334434509277, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1469.5316743850708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1418.9839887619019, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1492.0841598510742, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1889.5489692687988, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1465.7860803604126, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1405.6204748153687, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1473.4236860275269, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1732.0811176300049, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1514.0839862823486, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1469.3316745758057, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1560.0097703933716, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1913.1121635437012, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1468.4047985076904, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1407.507038116455, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 1477.4684762954712, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1734.9180698394775, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1505.677604675293, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1465.8600044250488, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1551.3670349121094, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1911.4406299591064, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1472.2489643096924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1406.6676807403564, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1475.433759689331, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1738.7915229797363, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1506.9355249404907, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1463.9012813568115, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1554.9022483825684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1917.3750305175781, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1627.5609731674194, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1889.9177742004395, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1858.1926345825195, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1848.3979225158691, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2052.4164867401123, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2084.5350456237793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1703.5097694396973, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1926.6430377960205, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1928.4545707702637, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1876.5331077575684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2061.3913536071777, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2096.091833114624, 
"config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1698.87375831604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1943.1470489501953, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1924.9762916564941, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1876.066541671753, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2075.983829498291, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2122.257432937622, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1705.8124732971191, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1953.1328105926514, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1918.957290649414, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1878.0246257781982, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2080.8945655822754, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2123.6931324005127, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2629.4921493530273, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2685.147657394409, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2702.979507446289, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2819.9939155578613, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2699.8854446411133, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2820.817451477051, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2721.4918422698975, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2827.228488922119, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1057.5131225585938, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1037.5414419174194, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1220.5158424377441, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1223.4491205215454, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1152.209768295288, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1065.4771184921265, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1388.2999992370605, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1394.9768114089966, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1084.7675275802612, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1066.4967966079712, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1241.1831998825073, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1210.3449630737305, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.6248006820679, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1088.4873628616333, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1395.546236038208, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1402.5727939605713, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1084.226884841919, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 1067.0102453231812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1246.3929605484009, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1215.821294784546, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1173.4830379486084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1084.2636823654175, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1388.6584043502808, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1385.0230360031128, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1081.43967628479, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1064.3951988220215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1236.0428762435913, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1219.1348791122437, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1169.184947013855, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1084.03968334198, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1391.0931253433228, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1387.564001083374, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1641.020975112915, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1325.3019332885742, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1895.453462600708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1529.0574502944946, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1713.3363246917725, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1349.8348760604858, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1972.0801734924316, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1509.5516729354858, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1719.552812576294, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1342.7391862869263, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1983.0603313446045, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 
64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1520.4592037200928, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1719.7990322113037, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1343.1979084014893, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1978.858060836792, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1513.9113664627075, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 924.419846534729, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1015.1006412506102, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 984.9289560317993, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 939.1942405700684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 998.3777523040771, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 955.9843111038208, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 938.6153650283813, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 1023.9428758621216, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 978.4814262390137, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 951.4891242980957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1012.5552034378052, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 970.5814456939697, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 934.7078466415405, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1026.7471981048584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 983.0995178222656, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 949.6609592437744, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1015.2871990203857, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 974.1088008880615, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 935.1436853408813, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1021.562876701355, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 979.0430450439453, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 956.3177680969238, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1009.8452806472777, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 972.2926330566406, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1321.053442955017, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1291.3400077819824, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1347.4276781082153, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1307.739839553833, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1346.5800046920776, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1306.2964820861816, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1350.1857662200928, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1312.5079917907715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1218.6171197891235, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1038.818564414978, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1231.9736003875732, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1057.2744035720825, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1244.9772882461548, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1063.8995265960693, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1258.5089683532715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1064.3272066116333, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1641.5592002868652, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1737.5435066223145, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1553.2987213134766, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1575.1219129562378, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1630.8640050888062, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1689.3091297149658, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1540.603518486023, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1544.8532819747925, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1666.7198276519775, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1775.0424194335938, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1585.210394859314, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1601.134557723999, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1660.9853076934814, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1726.457748413086, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1563.943510055542, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1587.3844861984253, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1668.0526447296143, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1786.3716888427734, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1583.89967918396, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1608.2689571380615, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1663.1523132324219, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1749.2984008789062, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1564.810242652893, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1584.557285308838, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1676.6187191009521, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1791.620798110962, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1585.8390522003174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1604.8867177963257, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1668.8651371002197, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1741.0388660430908, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 1550.6168031692505, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1593.7572717666626, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2199.7723293304443, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2105.3575897216797, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2271.6017532348633, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2201.5503787994385, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2278.3329582214355, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2194.853754043579, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2277.670087814331, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2203.658227920532, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1149.4248008728027, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1201.0124731063843, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1114.7900915145874, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, 
"num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1109.6756744384766, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1131.1153602600098, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1050.0936031341553, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1161.6267204284668, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1212.163200378418, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1123.1193590164185, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1124.148645401001, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1149.4571208953857, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1058.7790203094482, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1162.6384019851685, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1211.0390329360962, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1120.8777523040771, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1129.510407447815, 
"config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1147.9241609573364, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1063.5831928253174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1174.305911064148, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1209.2958402633667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1129.4958400726318, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1135.2355241775513, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1154.0955114364624, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1058.0985641479492, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1639.860315322876, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1420.9270286560059, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1669.1107082366943, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1458.9315223693848, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1670.8129501342773, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1453.2006549835205, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1672.5340747833252, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1462.3507118225098, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1355.4844760894775, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1018.0417585372925, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1215.4121589660645, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 895.5651235580444, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1386.5420818328857, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1022.2633600234985, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1249.7433519363403, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 901.5065574645996, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1401.9988918304443, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 
128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1030.3695964813232, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1235.3534317016602, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 898.5804748535156, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1402.422399520874, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1032.8955125808716, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1244.788475036621, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 908.5278415679932, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4880.365619659424, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1023.4084749221802, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4853.734874725342, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1020.901927947998, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4877.472496032715, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 1040.196795463562, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4915.046195983887, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1046.9940662384033, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2231.3545513153076, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1594.9457597732544, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2256.235990524292, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1611.6910457611084, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2252.9142475128174, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1625.1665496826172, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2273.8785552978516, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1639.686689376831, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2256.4112091064453, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1625.8951950073242, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, 
"num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2277.885446548462, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1642.6534271240234, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2279.5150470733643, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1631.4622402191162, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2299.007034301758, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1662.7503967285156, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1952.05246925354, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1384.4667196273804, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1963.1043338775635, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1399.660964012146, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2000.750379562378, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1400.0662326812744, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2086.5492725372314, 
"config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1417.965440750122, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8151.043663024902, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1137.5107192993164, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8101.377296447755, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1147.5318479537964, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8126.138877868651, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1155.546555519104, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8187.997932434081, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 1024, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1172.4057626724243, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4501.772003173828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4476.681880950928, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4595.272674560547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4824.999027252197, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6491.373138427734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6451.748886108398, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6519.189758300781, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6694.749221801758, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4445.0982666015625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4513.853130340576, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4607.033271789551, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4758.890552520752, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6519.580955505371, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6620.930976867676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6660.732536315918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6863.767547607422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4385.213603973389, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4488.21346282959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4589.728145599365, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4719.666290283203, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6550.609931945801, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6623.67244720459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6695.572891235352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6868.200759887695, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4349.499378204346, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4481.035556793213, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4566.498260498047, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4689.734401702881, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
6534.305877685547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6595.338973999023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6661.113128662109, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6858.8945388793945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4602.259044647217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4837.270107269287, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5330.09407043457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6223.501930236816, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6476.223373413086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6443.577041625977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6813.443222045898, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7823.028411865234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4516.089916229248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4762.248134613037, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5140.965576171875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6062.722225189209, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6474.412040710449, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6450.106506347656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6828.327674865723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7763.375625610352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4491.726551055908, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4710.494422912598, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5138.614902496338, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6004.417095184326, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6473.797721862793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6457.386703491211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6795.108413696289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7782.135162353516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4452.567825317383, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4676.65132522583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5112.550506591797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5957.404079437256, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6474.9260330200195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6447.209930419922, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6796.417465209961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7806.230392456055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6332.819709777832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7135.026893615723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 10207.952690124512, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10461.305046081543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7903.491401672363, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8124.693450927735, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11747.277526855469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12013.495712280273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6241.043319702148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6917.86865234375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10258.932838439941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10536.24095916748, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7833.250579833984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8106.434211730956, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11732.614250183105, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12013.905944824219, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6245.468158721924, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6870.0947189331055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10290.87963104248, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10575.134773254395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7885.059509277344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8121.0345458984375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11767.666091918945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12055.329704284668, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6243.568458557129, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6835.126037597656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10312.058029174805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10589.687614440918, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7896.106338500977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8122.471122741698, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11756.274185180664, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12066.599807739258, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3793.312530517578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3828.1289863586426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4013.2694244384766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4440.006904602051, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4149.987201690674, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4081.1844825744624, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4182.929744720459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4452.187042236328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 3523.9007568359375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3603.867988586426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3802.4729537963867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4189.268283843994, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4114.420166015625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3937.844982147217, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3997.6576042175293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4250.606575012207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3431.3582038879395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3543.093090057373, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3768.696460723877, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4149.169750213623, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4069.345607757568, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} 
-{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3769.8972702026367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3991.5707397460938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4240.783576965332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3385.4092407226562, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3507.2334480285645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3736.8077087402344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4120.274543762207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4071.1187171936035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3779.008140563965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3910.220470428467, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4233.52819442749, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3967.543830871582, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4669.752979278564, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5991.007862091064, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6032.094097137451, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4294.205303192139, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4483.299198150635, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5904.977283477783, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6238.525276184082, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3837.508964538574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4345.179653167725, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5309.876937866211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5604.533748626709, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4191.069240570068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4218.344631195068, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 5845.201778411865, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6097.564868927002, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3794.8515129089355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4336.195507049561, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5316.790561676025, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5619.11600112915, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4163.519535064697, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4168.171539306641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5857.406425476074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6119.179668426514, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3754.4336128234863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4299.7100830078125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5316.496448516846, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, 
"num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5622.731857299805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4150.717430114746, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4162.398376464844, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5858.257732391357, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6120.5659103393555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6175.119705200195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8049.147415161134, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5206.583499908447, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6975.384483337402, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5865.055198669434, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7960.373420715332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4995.047397613525, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6953.708953857422, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5860.794429779053, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7963.560791015625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5025.825271606445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6950.38818359375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5831.2006187438965, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7974.543342590332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5018.5211753845215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6963.239974975586, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3511.988945007324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3601.803379058838, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3880.9281730651855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4476.836833953857, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 
768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3647.3213958740234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3526.692314147949, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3649.1643142700195, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4374.498043060303, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3328.064708709717, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3378.2118225097656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3554.215850830078, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3834.5805168151855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3489.000473022461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3372.9222297668457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3561.5489959716797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4130.829277038574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3264.5939445495605, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3322.091064453125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3520.376625061035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3820.1326751708984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3445.870590209961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3332.6084327697754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3524.8219108581543, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4128.100337982178, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3232.072582244873, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3296.494083404541, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3494.9300575256348, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3819.941749572754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3415.5188941955566, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
3307.6962089538574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3500.6948471069336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4127.466907501221, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3685.0516510009766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5334.110870361328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5270.118885040283, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3945.6153678894043, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4625.023670196533, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4706.414222717285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3490.0711822509766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4267.937145233154, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4407.587203979492, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3694.404468536377, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4087.0539855957027, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4233.083534240723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3445.5129432678223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4267.350101470947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4374.905776977539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3669.6294593811035, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4086.6883659362793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4229.765281677246, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3425.679397583008, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4261.112632751465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4378.818035125732, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3669.196300506592, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4090.2126121521, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4229.567489624023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9747.15045928955, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6737.411727905273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9281.757926940918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5949.799861907959, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9250.796775817871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5940.980663299561, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9325.881004333496, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5935.764141082764, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3639.725818634033, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3866.80477142334, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4113.134059906006, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3515.4472160339355, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3507.6281929016113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3597.716808319092, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3372.3005867004395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3266.4663696289062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3320.0749015808105, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3252.4467277526855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3069.560136795044, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3151.444139480591, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3343.192958831787, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3266.6601753234863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3302.898406982422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3184.32767868042, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3060.3897380828857, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3122.3334407806396, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3354.7313690185547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3264.2067527770996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3299.488925933838, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3152.651844024658, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3049.893922805786, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3127.1352100372314, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5217.597427368164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4897.487201690674, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4590.868148803711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4204.952335357666, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
4624.66064453125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4243.705749511719, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4650.420455932617, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4257.638244628906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3118.718252182007, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3088.4513664245605, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3228.6662673950195, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3433.58154296875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3766.613426208496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3607.0473289489746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3705.599250793457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3869.3489265441895, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3205.5910301208496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3174.9499320983887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3300.84716796875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3504.136791229248, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3701.3529777526855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3630.0294303894043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3729.504623413086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3860.379066467285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3223.0859375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3172.332019805908, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3299.848003387451, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3501.541748046875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3716.8892860412598, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3623.1249809265137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3718.340435028076, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3857.16495513916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3225.405445098877, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3168.0207920074463, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3287.7656173706055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3495.246696472168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3722.9819297790527, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3623.827476501465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3765.1364517211914, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3869.289722442627, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3466.744804382324, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3419.8892784118652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
3874.754066467285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5302.344951629639, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3880.952663421631, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3643.459529876709, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4094.6449661254887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5723.1366539001465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3581.3921546936035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3455.153121948242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3870.364513397217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5342.457275390625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3942.040557861328, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3695.1289558410645, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4058.352012634277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5768.589763641357, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3583.4195518493652, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3446.4673233032227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3868.288993835449, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5359.234390258789, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3942.243881225586, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3693.241901397705, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4050.8732604980473, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5783.154544830322, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3594.0830039978027, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3432.964973449707, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3857.1899032592773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5359.710693359375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3938.7405014038086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3692.9572677612305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4045.120162963867, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5794.701290130615, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4703.193759918213, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6730.025863647461, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7034.223175048828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4736.618900299072, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6349.532470703125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6558.7089920043945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4776.399993896484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6765.701751708984, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 7063.69930267334, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4804.255352020264, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6335.873928070068, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6592.266006469727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4789.127044677734, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6776.029205322266, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7080.133476257324, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4868.7005043029785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6334.608936309814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6615.4522705078125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4789.416313171387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6781.78581237793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7088.938446044922, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} 
-{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4863.076515197754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6337.70658493042, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6600.793609619141, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2720.5924701690674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2483.000135421753, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2557.7582454681396, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2807.9740715026855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2887.930555343628, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2642.5675296783447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2729.7830390930176, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2942.9398441314697, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2714.682083129883, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2481.077461242676, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2546.5123176574707, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2770.3656101226807, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2895.2795219421387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2607.7709007263184, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2708.1854248046875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2939.1771125793457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2714.30495262146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2478.200340270996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2546.5188694000244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2772.149600982666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2881.8273735046387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2604.538402557373, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 2704.7207927703857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2940.7270431518555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2694.829921722412, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2463.8759994506836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2545.979347229004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2763.0006408691406, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2867.0900535583496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2610.8263969421387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2702.8379344940186, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2941.325922012329, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3304.3449783325195, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2946.0659313201904, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3681.593132019043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} 
-{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3774.7881507873535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3171.1609649658203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2925.623025894165, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3788.6430168151855, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3931.0774993896484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3327.6171684265137, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2923.1019115448, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3621.9423866271973, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3753.1246376037598, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3165.8862495422363, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2857.4206352233887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3717.0596885681152, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3865.8922958374023, "config": 
{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3334.316005706787, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2924.689598083496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3620.1977729797363, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3770.849952697754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3176.141757965088, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2844.6001625061035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3720.8348655700684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3876.608295440674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3328.4422492980957, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2904.8958492279053, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3614.6722984313965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3763.5817527770996, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3183.7396717071533, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2835.171184539795, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3723.5833740234375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3868.010883331299, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4164.863548278809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6000.172481536865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4880.343532562256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4500.551853179932, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4057.3843002319336, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5990.242042541504, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4934.727382659912, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4516.934585571289, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4065.2088165283208, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 
256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5999.169750213623, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4946.841106414795, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4526.129913330078, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4067.943077087403, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6001.995334625244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4967.490997314453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4524.410858154297, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2329.4815921783447, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2254.2701053619385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2658.5729789733887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2751.10577583313, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2524.0403175354004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 2273.901767730713, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2633.316650390625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2693.74080657959, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2323.999032974243, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2201.540126800537, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2587.860336303711, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2608.096332550049, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2547.7801513671875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2203.648633956909, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2632.353458404541, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2671.669120788574, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2317.3291301727295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2192.876319885254, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2584.748487472534, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2614.9190425872803, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2570.39870262146, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2208.9948749542236, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2629.9943828582764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2675.008478164673, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2308.3233642578125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2189.1340732574463, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2578.7911891937256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2599.204158782959, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2569.041585922241, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2208.9455795288086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2635.618886947632, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2671.4688396453857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2987.042074203491, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3056.736011505127, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3008.3004760742188, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2971.368474960327, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2881.4502239227295, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2984.9358463287354, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2952.034397125244, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2931.6843223571777, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2864.007501602173, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2988.917293548584, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2983.052167892456, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2929.928960800171, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2858.8468837738037, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2982.063512802124, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2966.4201641082764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2931.808490753174, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6786.209259033203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4189.369297027588, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6801.674156188965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3874.8891258239746, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6815.809783935547, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3881.0673904418945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6844.6452713012695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3891.468029022217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, 
"GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2265.273609161377, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2397.8164863586426, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2418.663845062256, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2885.739040374756, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2054.7451210021973, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2186.054229736328, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2173.60990524292, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2342.2713661193848, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2374.5987224578857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2956.7659187316895, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2123.346529006958, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2159.929599761963, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 2144.9292850494385, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2342.829303741455, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2371.6324615478516, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2988.371047973633, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2117.5566387176514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2162.1068954467773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2130.33935546875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2333.969268798828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2377.947368621826, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2985.001745223999, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2125.984516143799, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2162.61775970459, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3956.224308013916, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 
2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2983.755865097046, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3867.3764419555664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2749.0939140319824, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3878.766269683838, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2768.2540893554688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3888.159713745117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2770.9875106811523, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2484.5174503326416, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2374.1059017181396, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2539.62703704834, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2870.965929031372, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2581.8467235565186, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2532.2443294525146, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2655.8878326416016, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3146.6694355010986, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2662.3427200317383, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2554.0430450439453, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2682.876787185669, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3022.5736045837402, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2697.899351119995, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2644.7134399414062, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2758.72145652771, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3216.949586868286, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2692.0255851745605, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2567.3808097839355, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, 
"topk": 10, "dtype": null, "kernel_time": 2686.133451461792, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3010.448799133301, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2698.8494396209717, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2647.906713485718, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2774.6555137634277, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3212.5121212005615, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2699.0243339538574, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2562.335367202759, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2680.5871772766113, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3009.105110168457, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2693.1356811523438, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2642.436475753784, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2773.9401626586914, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} 
-{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3224.5347118377686, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2914.9937629699707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3213.763484954834, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3195.325756072998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3100.2360248565674, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3518.490734100342, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3605.5848503112793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3126.227045059204, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3347.797565460205, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3368.3764839172363, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3192.1017742156982, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3548.5290908813477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3611.4699363708496, "config": {"BLOCK_SIZE_M": 
64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3145.4678440093994, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3370.285243988037, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3372.3502349853516, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3200.5257606506348, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3559.3400382995605, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3610.050106048584, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3136.630268096924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3415.272846221924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3368.572940826416, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3204.909152984619, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3591.006908416748, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3615.6894493103027, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4395.183029174805, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4496.532001495361, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4660.5768394470215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5022.982711791992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4740.288944244385, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5031.897106170654, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4859.23770904541, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4996.597099304199, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1837.0198440551758, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1847.7694511413574, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2085.965919494629, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2163.585786819458, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1974.6622467041016, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, 
"num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1893.5058879852295, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2340.477924346924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2382.181463241577, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1877.7051258087158, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1887.201747894287, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2059.0100860595703, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2104.0542602539062, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2019.3516826629639, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1909.5312023162842, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2363.4921550750732, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2375.039358139038, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1874.095516204834, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1888.3601570129395, "config": 
{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2072.476644515991, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2095.943841934204, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2019.9107265472414, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1898.3524703979492, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2372.332181930542, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2383.620481491089, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1872.3390483856201, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1884.1473484039307, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2087.944812774658, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2105.9988689422607, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2013.5446262359617, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1886.812505722046, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2378.599843978882, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2380.7683277130127, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2775.089111328125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2277.700490951538, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3207.937431335449, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2549.154224395752, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2907.276153564453, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2255.669937133789, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3343.5718154907227, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2544.282398223877, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2934.0635204315186, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2262.919521331787, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3355.393753051758, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2544.775676727295, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2961.8897247314453, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2277.4760246276855, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3375.8998489379883, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2550.0185775756836, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1565.0550413131714, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1646.5396690368652, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1675.9718418121338, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1606.018214225769, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1627.3336029052734, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1635.5547332763672, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1586.362247467041, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
1650.9264183044434, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1584.6915197372437, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1628.764820098877, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1653.036642074585, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1560.931043624878, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1574.4308805465698, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1652.5438404083252, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1599.7606468200684, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1625.2513694763184, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1654.4591999053955, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1568.3742380142212, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1571.705436706543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1653.3078384399414, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1605.5446290969849, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1622.8382301330566, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1659.5971393585205, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1579.017915725708, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2166.190881729126, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2123.4655952453613, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2211.9351863861084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2147.7110385894775, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2219.3915271759033, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2153.4294605255127, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2228.9200019836426, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2167.0872020721436, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1966.635971069336, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1737.1729850769043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2008.9507389068601, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1778.6039733886719, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2019.129123687744, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1781.6024017333984, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2029.2740917205808, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1782.5032043457031, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2544.0310287475586, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2666.646890640259, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2471.5545558929443, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2544.7724628448486, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2532.98752784729, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2597.9281425476074, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2434.9660682678223, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2513.2185554504395, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2705.142068862915, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2823.927993774414, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2657.149305343628, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2678.7892627716064, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2674.005756378174, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2743.731346130371, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2597.96462059021, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2627.9177570343018, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2704.514560699463, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2824.0627479553223, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2649.4464015960693, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2679.812641143799, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2675.7040119171143, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2755.8220958709717, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2577.6774406433105, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2647.2836875915527, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2714.5683193206787, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2838.8308811187744, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2642.5849628448486, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2716.9918537139893, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2686.0366249084473, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2765.557279586792, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
2579.8083114624023, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2658.470239639282, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3431.207695007324, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3298.8564682006836, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3651.2719917297363, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3542.6519775390625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3669.569263458252, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3562.3171615600586, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3684.4561767578125, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3579.6459007263184, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1751.9115161895752, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1850.3244972229004, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1764.5791816711426, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1708.96879196167, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1747.1977710723877, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1674.3115043640137, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1815.3483200073242, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1897.80366897583, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1804.3208026885986, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1775.0193786621094, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1806.7799949645996, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1726.9966316223145, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1816.215991973877, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1896.871042251587, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1812.8505611419678, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1771.8852710723877, "config": {"BLOCK_SIZE_M": 128, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1807.6800060272217, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1733.6508750915527, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1821.3190460205078, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1906.015043258667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1816.7331218719482, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1778.5372734069824, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1813.5104084014893, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1739.2193603515625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2545.1257705688477, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2166.2945556640625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2622.8647994995117, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2274.1382598876953, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2633.0740547180176, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2281.5617656707764, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2651.868963241577, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2294.6596717834473, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1978.2046031951904, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1526.226725578308, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1776.9603061676025, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1344.694414138794, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2063.998727798462, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1531.3457584381104, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1844.1820621490479, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1348.5841608047485, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2066.7555046081543, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1536.5121603012085, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1852.3998069763184, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1350.2444696426392, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2092.8347206115723, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1556.6660737991333, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1864.3438339233398, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1353.5115242004395, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6773.487854003906, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1424.6358346939087, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6673.906211853027, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1453.8748836517334, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6728.685111999512, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 1456.8828773498535, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6761.390686035156, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1472.1300888061523, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3791.300811767578, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2728.6046409606934, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3810.3568077087402, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2764.152822494507, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3982.0172691345215, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2759.61199760437, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4016.6202926635738, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2780.1638317108154, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3986.1649322509766, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2757.867670059204, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} 
-{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4030.6641769409175, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2795.4072093963623, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4012.9912948608403, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2762.097930908203, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4059.8062133789067, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2789.9479961395264, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3559.799041748047, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2370.920629501343, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3588.8417625427246, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2467.440481185913, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3609.7025299072266, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2469.7105503082275, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3722.0630645751953, "config": 
{"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2487.5911903381348, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 14228.325958251953, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1904.384651184082, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 14099.525146484375, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1947.159194946289, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 14190.080337524414, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1940.0942420959473, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 14338.91616821289, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 2048, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 1979.7633647918701, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8921.394233703613, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8802.946014404297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9057.368621826172, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9526.80866241455, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12715.99422454834, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12744.107322692871, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12912.143478393555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13213.87321472168, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8757.198257446289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8826.296920776367, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9035.49201965332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9384.94197845459, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12826.043853759766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13063.629684448242, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13176.929473876953, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13547.579498291016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8562.346229553223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8751.338386535645, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8932.509841918945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9215.843048095703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12884.165267944336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13066.070175170898, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13105.992965698242, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13587.589263916016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8442.51392364502, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8688.856315612793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8834.5552444458, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9102.604522705078, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
12871.039352416992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13059.170684814453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13189.483337402344, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13590.643157958984, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9061.984252929688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9576.330604553223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10495.016326904297, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12295.045127868652, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12813.764152526855, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12702.4361038208, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13477.784576416016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15426.7919921875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8857.227821350098, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9389.389457702637, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10090.14087677002, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11900.856666564941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12820.61538696289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12721.660537719727, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13451.278076171875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15264.25537109375, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8689.59270477295, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9208.91586303711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9898.239784240723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11677.989959716797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12820.185546875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12756.160278320312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13455.092391967773, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15398.92219543457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8581.95785522461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9075.857429504395, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9794.788780212402, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11617.53776550293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12823.20816040039, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12718.7788772583, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13457.485580444336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15387.900390625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12477.331886291504, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 14081.559143066406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 20124.072494506836, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20633.803253173828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15470.345306396484, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 16005.506439208986, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23119.6044921875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23613.29620361328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12175.71418762207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13572.221145629883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20156.85791015625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20669.051666259766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15368.141555786133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 16007.25715637207, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23097.970123291016, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23648.968658447266, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12014.785919189453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13399.975509643555, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20195.578079223633, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20713.53401184082, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15402.4267578125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15997.926177978516, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23131.79039001465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23689.181365966797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11965.028533935547, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13307.84194946289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20258.31329345703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 20784.565353393555, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15465.474090576172, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15989.4140625, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23174.753875732422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 23728.900299072266, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7546.294174194336, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7568.195343017578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7923.584327697754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8749.089050292969, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8156.69376373291, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8025.545616149902, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8213.760108947754, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8755.23151397705, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 6950.532646179199, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7112.708511352539, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7482.588653564453, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8287.855911254883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7990.293960571289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7578.736877441406, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7790.513763427734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8375.339813232422, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6740.573768615723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6941.6357421875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7350.207786560059, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8133.879165649414, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7889.267845153809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7540.074844360352, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7835.670738220215, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8361.945991516113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6580.885848999023, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6851.1309814453125, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7268.830757141113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8079.184684753418, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7842.289123535156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7507.826461791992, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7775.049247741699, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8308.173217773438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7817.067337036133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9218.370704650879, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11759.391288757324, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11810.13786315918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8414.02816772461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8783.094100952148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11585.993309020996, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12300.49674987793, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7541.93473815918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8485.203742980957, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10423.292846679688, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11003.731269836426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8146.733779907227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8287.874145507812, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 11492.636642456055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11990.254974365234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7391.43123626709, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8313.518524169922, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10433.409767150879, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11014.082832336426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8043.992576599122, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8194.612579345703, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11498.256568908691, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12000.086059570312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7293.100166320801, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8318.657836914062, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10443.184051513672, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} 
-{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11035.109596252441, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7988.163070678711, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8183.712196350097, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11509.198417663574, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12019.485397338867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12139.013366699219, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15865.266647338867, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10217.134170532227, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13726.988296508789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11457.092399597168, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15657.847671508789, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9824.922752380371, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13647.091827392578, "config": 
{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11340.995063781738, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15649.838333129883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9744.701538085938, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13659.579238891602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11291.09058380127, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 15675.315170288086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9724.14836883545, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13675.612258911133, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6971.566390991211, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7113.076438903809, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7648.276901245117, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8820.225677490234, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7229.855995178223, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6963.268165588379, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7202.675132751465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8622.655181884766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6504.2303466796875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6584.717979431152, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7016.745948791504, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7520.47248840332, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6859.061660766602, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6674.165191650391, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7014.051818847656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8101.257133483888, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6339.448337554932, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6413.9667320251465, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6943.759346008301, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7514.387397766113, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6747.604789733887, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6609.2072677612305, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6952.904357910156, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8100.79490661621, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6278.879680633545, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6360.822906494141, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6900.798492431641, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7515.477447509766, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6684.498138427734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 6521.629772186279, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6897.267150878906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8096.141395568849, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7281.971702575684, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10587.998847961426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10339.899940490723, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7742.278938293457, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9166.457901000977, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9231.913146972656, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6824.395790100098, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8372.547225952148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8600.730628967285, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7255.618019104004, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 
4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8035.645332336426, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8318.728866577148, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6702.249336242676, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8365.238456726074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8584.620590209961, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7159.531440734863, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8029.272994995118, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8313.734893798828, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6661.586112976074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8377.605934143066, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8609.520874023438, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7152.236480712891, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8027.976570129395, "config": {"BLOCK_SIZE_M": 16, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8317.921905517578, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 19114.003982543945, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13265.303421020508, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17829.020767211914, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11673.614959716797, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17603.336029052734, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11601.312675476074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 17561.44073486328, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11599.622917175293, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7192.3652267456055, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7864.924049377441, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8102.756042480469, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6971.127395629883, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7029.443130493164, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7102.93643951416, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6697.6287841796875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6398.336429595947, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6489.144706726074, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6449.186134338379, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5987.6518630981445, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6105.659523010254, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6545.23551940918, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6390.511150360107, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6478.512191772461, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6296.991806030273, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, 
"num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5969.944667816162, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6084.979248046875, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6510.368194580078, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6382.5983810424805, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6471.653060913086, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6211.999187469482, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5970.493412017822, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6107.236766815186, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10194.469947814941, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9579.85164642334, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9062.497444152832, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8250.183715820312, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
9135.401916503906, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8306.723518371582, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9164.434585571289, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8349.431533813477, "config": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6121.373729705811, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6090.390205383301, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6353.202228546143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6753.126373291016, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7294.752655029297, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7045.125885009766, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7218.518829345703, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7486.556282043457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6254.959506988525, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6212.639198303223, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6448.1660079956055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6837.817077636719, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7171.026039123535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7075.902557373047, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7259.60994720459, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7493.586235046387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6241.973743438721, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6197.01473236084, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6431.444911956787, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6808.745765686035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7193.955230712891, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7066.846237182617, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, 
"GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7258.921318054199, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7492.333564758301, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6230.764503479004, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6199.041404724121, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6458.719806671143, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6806.267929077148, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7212.549591064453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7148.083686828613, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7307.613067626953, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7545.159759521484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6729.412612915039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6685.7780838012695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 
7571.610336303711, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10270.978965759277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7515.844802856445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7138.77742767334, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7922.033538818359, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11108.142204284668, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6868.645668029785, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6707.935218811035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7496.704216003418, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10306.874313354492, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7572.888145446777, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7205.68416595459, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7890.619659423828, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11146.778182983398, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6881.10237121582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6668.924942016602, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7475.246238708496, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10353.437767028809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7569.817924499512, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7162.161865234375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7877.142066955566, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11191.718864440918, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6920.898704528809, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6696.620178222656, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7438.426780700684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 10335.148887634277, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7605.741882324219, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7227.794189453125, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7880.672836303711, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11194.249229431152, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9092.849006652832, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13065.132369995117, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13673.971862792969, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9176.814422607422, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12326.59294128418, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12724.54574584961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9136.300964355469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13081.178359985352, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 13677.448806762695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9162.568740844727, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12296.844215393066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12764.417724609375, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9241.964797973633, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13118.581771850586, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13715.647201538086, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9291.53938293457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12299.37858581543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12792.325477600098, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9259.955673217773, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13144.612884521484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 13740.750503540039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} 
-{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9291.710815429688, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12300.87875366211, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12773.201866149902, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5242.7069091796875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4901.962261199951, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5063.720798492432, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5439.4683265686035, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5579.942569732666, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5271.495342254639, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5415.397090911865, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5692.954082489014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5232.861309051514, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4865.835494995117, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5024.890384674072, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5351.528491973877, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5565.492839813232, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5200.496368408203, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5363.077774047852, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5708.000183105469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5220.245895385742, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4853.94157409668, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4998.821907043457, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5347.2536277771, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5551.515197753906, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5169.713611602783, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 5344.7577476501465, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5707.862224578857, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5228.631858825684, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4847.395648956299, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4995.970230102539, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5356.981258392334, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5588.656024932861, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5217.306385040283, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5371.806106567383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5757.880477905273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6374.928455352783, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5721.066875457764, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7129.097137451172, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 
4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7331.862869262695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6120.778541564941, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5677.575969696045, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7334.541664123535, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7632.723159790039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6367.702560424805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5563.396320343018, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6996.033744812012, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7236.088905334473, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6037.512664794922, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5594.335498809814, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7228.446426391602, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7462.286186218262, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 
64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6387.316837310791, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5550.875873565674, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7007.556991577148, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7255.130271911621, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6111.2566566467285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5567.767181396484, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7211.662521362305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7479.581184387207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6367.520980834961, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5573.444633483887, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7009.959182739258, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7256.384353637695, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 
10, "dtype": null, "kernel_time": 6131.035995483398, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5559.947204589844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7206.420783996582, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7470.553131103516, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8085.631370544434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11637.265968322754, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9331.630821228027, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8726.62338256836, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7806.331748962402, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11600.500411987305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9464.11075592041, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8722.527809143066, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7812.524185180664, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} 
-{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11622.32608795166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9484.233207702637, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8741.754837036133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7826.434783935547, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 11628.552474975586, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9588.116226196289, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8744.411087036133, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4491.222190856934, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4425.335216522217, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5109.991054534912, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5280.927200317383, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4831.8500900268555, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4541.970062255859, "config": {"BLOCK_SIZE_M": 32, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4993.255176544189, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5140.538215637207, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4453.301086425781, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4286.255798339844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4835.577583312988, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4900.55534362793, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4821.284484863281, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4346.50016784668, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4982.926731109619, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5070.86238861084, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4441.084957122803, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4240.1971435546875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 
4096, "topk": 10, "dtype": null, "kernel_time": 4846.314373016357, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4904.797763824463, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4864.926738739014, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4392.61100769043, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5000.529594421387, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5082.287502288818, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4434.372482299805, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4230.29691696167, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4841.675186157227, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4902.29024887085, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4846.726722717285, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4387.500782012939, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5005.563011169434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5075.220642089844, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5815.10124206543, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5965.20622253418, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5757.874736785889, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5741.0747146606445, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5578.766269683838, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5770.555324554443, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5509.973278045654, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5658.3909034729, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5547.990741729736, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5775.943870544434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5542.866897583008, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5665.225315093994, 
"config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5526.2873458862305, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5775.366554260254, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5504.351215362549, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5663.755855560303, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12964.179992675781, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8015.7244873046875, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12815.894813537598, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7490.6207275390625, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12833.862991333008, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7495.19718170166, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12879.483184814453, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7530.080261230469, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 
72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4275.015678405762, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4513.440628051758, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4638.223667144775, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5394.699821472168, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4033.9766693115234, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4261.799182891846, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4064.9882888793945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4321.758270263672, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4393.400497436523, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5475.376110076904, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4015.00093460083, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4076.5660667419434, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4004.2167663574214, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4328.8886642456055, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4389.354667663574, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5572.663154602051, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3990.904312133789, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4087.2072219848637, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3955.257110595703, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4314.093475341797, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4385.994548797607, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5557.371349334717, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4011.101741790771, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4089.041652679444, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7387.913932800293, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 5673.145446777344, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7068.588218688965, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5149.834060668945, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7103.826675415039, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5171.249618530273, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 7122.047653198242, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5179.616603851318, "config": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4721.483535766602, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4626.976776123047, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4942.101402282715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5558.856315612793, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4936.938877105713, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4886.602687835693, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5091.270713806152, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5883.175048828125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5072.849521636963, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4927.640323638916, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5139.209613800049, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5769.662113189697, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5161.816825866699, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5080.3424072265625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5285.80623626709, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6066.2321853637695, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5139.783153533936, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4937.02672958374, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5139.164962768555, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5752.380294799805, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5195.106887817383, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5128.765754699707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5317.273120880127, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6043.233585357666, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5137.52067565918, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5034.200325012207, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5285.714225769043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5873.941116333008, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5220.712776184082, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5420.234508514404, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5621.2993240356445, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 6132.96142578125, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5686.548290252686, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6098.449459075928, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6154.724044799805, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5755.120105743408, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6574.627380371094, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6757.98210144043, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6009.357624053955, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6320.3631591796875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6400.383834838867, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5999.11506652832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6616.667861938477, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6775.768013000488, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6063.161392211914, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6345.622844696045, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6405.554294586182, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6012.612113952637, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6673.530235290527, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6786.916847229004, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6022.8545570373535, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6549.9711990356445, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6467.179985046387, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6040.431880950928, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6826.678466796875, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6802.9462814331055, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8211.717071533203, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 
32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8330.993385314941, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8784.50366973877, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9547.28042602539, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 8966.997108459473, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9559.884376525879, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9305.77091217041, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 9317.547492980957, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3478.3670234680176, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3544.8641777038574, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3961.253433227539, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4073.9996910095215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3718.238410949707, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 3601.4801597595215, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4304.349746704102, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4351.166572570801, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3526.5097427368164, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3548.5327911376953, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3757.4961853027344, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3938.906593322754, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3788.4286499023438, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3591.571846008301, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4310.843372344971, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4345.727672576904, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3523.0323219299316, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3543.544807434082, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, 
"num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3777.9487800598145, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3938.822555541992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3791.270046234131, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3602.6802825927734, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4314.800815582275, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4343.420448303223, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3523.5100746154785, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3520.556182861328, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3874.6030235290527, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3995.9201622009277, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3801.1135864257812, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3535.406894683838, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4327.811374664307, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4353.367824554443, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5194.7731590271, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4263.067665100098, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5988.117733001709, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4773.443756103516, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5438.2037353515625, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4118.177108764648, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6205.970993041992, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4755.063171386719, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5552.203845977783, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4125.783958435059, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6250.855541229248, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": 
null, "kernel_time": 4761.8889808654785, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5633.015022277832, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4221.579170227051, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6293.305606842041, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4766.275691986084, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2974.7632026672363, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3186.078233718872, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3173.4646320343018, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3025.1212787628174, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3045.949754714966, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3118.4873485565186, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2880.361089706421, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2952.5241661071777, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 
4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2902.7505493164062, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2957.41117477417, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2950.740785598755, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2878.1841373443604, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2844.810085296631, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2960.7401275634766, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2889.6886253356934, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2936.181116104126, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2960.477924346924, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2851.6566467285156, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2838.7814331054688, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2989.2156887054443, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2934.9668979644775, "config": {"BLOCK_SIZE_M": 64, 
"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2933.474063873291, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2988.5433769226074, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2862.9895973205566, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3929.7780990600586, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3835.437545776367, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4017.5388717651367, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3886.0865592956543, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4079.34799194336, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3920.0889778137207, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4156.153869628906, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3978.694896697998, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3539.1539573669434, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, 
"hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3115.3745555877686, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3549.407501220703, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3143.4239864349365, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3560.8947372436523, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3156.056480407715, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3599.956169128418, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3183.416795730591, "config": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4415.585784912109, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4497.762222290039, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4350.323390960693, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4609.195499420166, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4456.599960327148, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4383.949565887451, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 
8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4244.342555999756, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4514.317760467529, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5021.20512008667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5195.3972816467285, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4943.681774139404, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4939.017105102539, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4927.8521728515625, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5021.814212799072, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4760.08638381958, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4788.812675476074, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5046.364765167236, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5197.506885528564, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4991.850051879883, 
"config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5463.424320220947, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4956.982421875, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5017.5568199157715, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4799.3110275268555, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5310.731639862061, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5067.37154006958, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5207.324466705322, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5093.074531555176, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5885.851535797119, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4984.56579208374, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5019.412002563477, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4914.679298400879, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5748.238277435303, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 5}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5819.990863800049, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5617.7463722229, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6663.072319030762, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6541.342887878418, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6692.149505615234, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6444.217224121094, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6744.459991455078, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6478.198013305664, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3130.5244636535645, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3111.168165206909, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3048.563995361328, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3070.095043182373, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, 
"BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2946.063823699951, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3023.13871383667, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3300.2358436584473, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3238.816967010498, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3205.173749923706, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3213.7282943725586, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3135.974712371826, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3085.6080055236816, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3300.726737976074, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3261.236152648926, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3342.7804565429688, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3205.916805267334, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, 
"dtype": null, "kernel_time": 3147.18656539917, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3214.2523288726807, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3315.5582427978516, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3296.5883255004883, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3389.9593544006348, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3216.247844696045, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3172.9556941986084, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3275.124931335449, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4271.883354187012, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3635.2961921691895, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4542.604808807373, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4024.113445281983, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4598.404808044434, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} 
-{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4030.421142578125, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4692.101268768311, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4060.154399871826, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3375.795021057129, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2596.9678592681885, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2999.666872024536, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2274.325580596924, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3539.1600036621094, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2617.7446269989014, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3128.3199977874756, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2301.5169620513916, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3572.500991821289, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2636.0851192474365, "config": 
{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3148.9195251464844, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2308.2815837860107, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3645.4359817504883, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2671.486873626709, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3201.6777515411377, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2331.713285446167, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12129.61498260498, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2472.450065612793, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12021.38786315918, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2516.5416049957275, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12109.438362121582, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2543.803997039795, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, 
"shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 12266.20891571045, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2573.5532760620117, "config": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5870.174865722656, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4383.220615386963, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5940.574893951416, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4443.357467651367, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6369.368801116943, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4693.510112762451, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6454.839515686035, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4678.781585693359, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6425.933494567871, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4726.06876373291, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6507.122688293457, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 
64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4708.105945587158, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6576.025466918945, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4785.804138183594, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 6655.387382507324, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4762.782192230225, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 3}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5467.3956871032715, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3689.639949798584, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5539.336166381836, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3981.846866607666, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5637.489109039307, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 4012.6147460937495, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 5972.575836181641, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, 
"kernel_time": 4119.997615814209, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 21874.033203125, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2898.939847946167, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 21101.767578125, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 2996.2156677246094, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 21193.941802978516, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3030.837278366089, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 21327.536239624023, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 2}} -{"num_tokens": 4096, "num_experts": 72, "shard_intermediate_size": 768, "hidden_size": 4096, "topk": 10, "dtype": null, "kernel_time": 3117.4169731140137, "config": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 2}} diff --git a/tuning_0.log b/tuning_0.log deleted file mode 100644 index 9737a859d..000000000 --- a/tuning_0.log +++ /dev/null @@ -1,113327 +0,0 @@ -INFO 07-23 11:40:53 [__init__.py:235] Automatically detected platform cuda. 
-Namespace(input_len=64, output_len=1, batch_size=1, n=1, use_beam_search=False, num_iters_warmup=3, num_iters=3, profile=False, output_json='/home/zrlngl/watsonx/zrl-triton-results-and-notebooks/vllm_benchmarks_latency/-net-storage149-autofs-css22-nmg-models-cos-1bfc857-fmaas-integration-tests-models-granite-4_0-small-base-pipecleaner-hf/NVIDIA_H100_80GB_HBM3/tuning_ignore/exp_2025-07-23_1140//result_bs_1_il_64_ol_1.json', disable_detokenize=False, model='/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf', task='auto', tokenizer=None, tokenizer_mode='auto', trust_remote_code=False, dtype='auto', seed=None, hf_config_path=None, allowed_local_media_path='', revision=None, code_revision=None, rope_scaling={}, rope_theta=None, tokenizer_revision=None, max_model_len=None, quantization=None, enforce_eager=False, max_seq_len_to_capture=8192, max_logprobs=20, logprobs_mode='raw_logprobs', disable_sliding_window=False, disable_cascade_attn=False, skip_tokenizer_init=False, enable_prompt_embeds=False, served_model_name=None, disable_async_output_proc=False, config_format='auto', hf_token=None, hf_overrides={}, override_neuron_config={}, override_pooler_config=None, logits_processor_pattern=None, generation_config='auto', override_generation_config={}, enable_sleep_mode=False, model_impl='auto', override_attention_dtype=None, load_format='auto', download_dir=None, model_loader_extra_config={}, ignore_patterns=None, use_tqdm_on_load=True, pt_load_map_location='cpu', guided_decoding_backend='auto', guided_decoding_disable_fallback=False, guided_decoding_disable_any_whitespace=False, guided_decoding_disable_additional_properties=False, reasoning_parser='', distributed_executor_backend=None, pipeline_parallel_size=1, tensor_parallel_size=1, data_parallel_size=1, data_parallel_rank=None, data_parallel_size_local=None, data_parallel_address=None, data_parallel_rpc_port=None, data_parallel_backend='mp', enable_expert_parallel=False, enable_eplb=False, num_redundant_experts=0, eplb_window_size=1000, eplb_step_interval=3000, eplb_log_balancedness=False, max_parallel_loading_workers=None, ray_workers_use_nsight=False, disable_custom_all_reduce=False, worker_cls='auto', worker_extension_cls='', enable_multimodal_encoder_data_parallel=False, block_size=None, gpu_memory_utilization=0.9, swap_space=4, kv_cache_dtype='auto', num_gpu_blocks_override=None, enable_prefix_caching=False, prefix_caching_hash_algo='builtin', cpu_offload_gb=0, calculate_kv_scales=False, limit_mm_per_prompt={}, media_io_kwargs={}, mm_processor_kwargs=None, disable_mm_preprocessor_cache=False, interleave_mm_strings=False, enable_lora=None, enable_lora_bias=False, max_loras=1, max_lora_rank=16, lora_extra_vocab_size=256, lora_dtype='auto', max_cpu_loras=None, fully_sharded_loras=False, default_mm_loras=None, enable_prompt_adapter=None, max_prompt_adapters=1, max_prompt_adapter_token=0, speculative_config=None, show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, max_num_batched_tokens=None, max_num_seqs=None, max_num_partial_prefills=1, max_long_partial_prefills=1, cuda_graph_sizes=[], long_prefill_token_threshold=0, num_lookahead_slots=0, scheduler_delay_factor=0.0, preemption_mode=None, num_scheduler_steps=1, multi_step_stream_outputs=True, scheduling_policy='fcfs', enable_chunked_prefill=None, disable_chunked_mm_input=False, scheduler_cls='vllm.core.scheduler.Scheduler', disable_hybrid_kv_cache_manager=False, 
async_scheduling=False, kv_transfer_config=None, kv_events_config=None, compilation_config={"level":0,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":[],"use_inductor":true,"compile_sizes":null,"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"use_cudagraph":true,"cudagraph_num_of_warmups":0,"cudagraph_capture_sizes":null,"cudagraph_copy_inputs":false,"full_cuda_graph":false,"max_capture_size":null,"local_cache_dir":null}, additional_config={}, disable_log_stats=False) -ERROR 07-23 11:41:01 [config.py:133] Error retrieving safetensors: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf'. Use `repo_type` argument if needed., retrying 1 of 2 -ERROR 07-23 11:41:03 [config.py:131] Error retrieving safetensors: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf'. Use `repo_type` argument if needed. -INFO 07-23 11:41:03 [config.py:3483] Downcasting torch.float32 to torch.bfloat16. -INFO 07-23 11:41:03 [config.py:1602] Using max model len 132096 -WARNING 07-23 11:41:03 [arg_utils.py:1684] Detected VLLM_USE_V1=1 with Mamba. Usage should be considered experimental. Please report any issues on Github. -INFO 07-23 11:41:03 [config.py:2424] Chunked prefill is enabled with max_num_batched_tokens=16384. -INFO 07-23 11:41:03 [config.py:214] Setting max_seq_len_to_capture to 132096 to ensure that CUDA graph capture covers sequences of length up to max_model_len. -[triton-dejavu] generated 75 configurations out of ConfigSpace: BLOCK_SIZE_M: [4, 8, 16, 32, 64], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 4, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. -[triton-dejavu] restored 0 configurations for _selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default. -[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. -[triton-dejavu] restored 0 configurations for _bmm_chunk_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. -[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. 
-[triton-dejavu] restored 0 configurations for _chunk_scan_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. -[triton-dejavu] restored 1 configurations for _chunk_cumsum_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default. -[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. -[triton-dejavu] restored 0 configurations for _chunk_state_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. -[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. -[triton-dejavu] restored 0 configurations for _chunk_state_varlen_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. -[triton-dejavu] generated 168 configurations out of ConfigSpace: BLOCK_SIZE: [32, 64, 128, 256, 512, 1024, 2048, 4096], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. -[triton-dejavu] restored 0 configurations for _state_passing_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default. -INFO 07-23 11:41:05 [config.py:279] Setting attention block size to 528 tokens to ensure that attention page size is >= mamba page size. -INFO 07-23 11:41:05 [config.py:300] Padding mamba page size by 0.69% to ensure that mamba page size and attention page size are exactly equal. -WARNING 07-23 11:41:05 [__init__.py:2904] We must use the `spawn` multiprocessing start method. Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. 
See https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing for more information. Reason: CUDA is initialized -INFO 07-23 11:41:09 [__init__.py:235] Automatically detected platform cuda. -INFO 07-23 11:41:10 [core.py:553] Waiting for init message from front-end. -INFO 07-23 11:41:10 [core.py:71] Initializing a V1 LLM engine (v0.1.dev7919+g84c7525) with config: model='/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf', speculative_config=None, tokenizer='/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=132096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"max_capture_size":512,"local_cache_dir":null} -[triton-dejavu] generated 75 configurations out of ConfigSpace: BLOCK_SIZE_M: [4, 8, 16, 32, 64], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 4, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. -[triton-dejavu] restored 0 configurations for _selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default. -[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. 
-[triton-dejavu] restored 0 configurations for _bmm_chunk_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. -[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. -[triton-dejavu] restored 0 configurations for _chunk_scan_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. -[triton-dejavu] restored 1 configurations for _chunk_cumsum_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-2fa507d0842a5f6a78eee941dc3c3a68f89756b47913aff39d4208afafb074fa/tune_features-604fd79069d101d891a5ad1f1f001551ff096d4dea3dc2c159faa57a9430d214/kernel_configs-86c110801e8443207d93837dc53554c59f26ccf5a1a04c352ea7e8587c82d89e/default. -[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. -[triton-dejavu] restored 0 configurations for _chunk_state_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. -[triton-dejavu] generated 2625 configurations out of ConfigSpace: BLOCK_SIZE_M: [16, 32, 64, 128, 256], BLOCK_SIZE_N: [16, 32, 64, 128, 256], BLOCK_SIZE_K: [16, 32, 64, 128, 256], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. -[triton-dejavu] restored 0 configurations for _chunk_state_varlen_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default. -[triton-dejavu] generated 168 configurations out of ConfigSpace: BLOCK_SIZE: [32, 64, 128, 256, 512, 1024, 2048, 4096], maxnreg: [None], num_buffers_warp_spec: [0], num_consumer_groups: [0], num_ctas: [1], num_stages: [1, 2, 3, 4, 5, 6, 8], num_warps: [2, 4, 8], reg_dec_producer: [0], reg_inc_consumer: [0]. 
-[triton-dejavu] restored 0 configurations for _state_passing_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default. -INFO 07-23 11:41:12 [parallel_state.py:1102] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 -INFO 07-23 11:41:12 [topk_topp_sampler.py:49] Using FlashInfer for top-p & top-k sampling. -INFO 07-23 11:41:12 [gpu_model_runner.py:1793] Starting to load model /net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf... -INFO 07-23 11:41:12 [gpu_model_runner.py:1826] Loading model from scratch... -INFO 07-23 11:41:12 [cuda.py:246] Using FlashInfer backend on V1 engine. -INFO 07-23 11:41:24 [default_loader.py:262] Loading weights took 11.59 seconds -INFO 07-23 11:41:24 [gpu_model_runner.py:1850] Model loading took 60.0260 GiB and 11.687507 seconds -INFO 07-23 11:41:27 [backends.py:530] Using cache directory: /home/zrlngl/.cache/vllm/torch_compile_cache/9bcd1b9f98/rank_0_0/backbone for vLLM's torch.compile -INFO 07-23 11:41:27 [backends.py:541] Dynamo bytecode transform time: 2.60 s -INFO 07-23 11:41:29 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 1.788 s -INFO 07-23 11:41:29 [fused_moe.py:688] Using configuration from /home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/model_executor/layers/fused_moe/configs/E=72,N=768,device_name=NVIDIA_H100_80GB_HBM3.json for MoE layer. -INFO 07-23 11:41:29 [monitor.py:34] torch.compile takes 2.60 s in total -INFO 07-23 11:41:30 [gpu_worker.py:245] Available KV cache memory: 8.99 GiB -INFO 07-23 11:41:31 [kv_cache_utils.py:997] GPU KV cache size: 58,608 tokens -INFO 07-23 11:41:31 [kv_cache_utils.py:1001] Maximum concurrency for 132,096 tokens per request: 4.29x -INFO 07-23 11:41:51 [gpu_model_runner.py:2395] Graph capturing finished in 20 secs, took 0.93 GiB -INFO 07-23 11:41:51 [core.py:193] init engine (profile, create kv cache, warmup model) took 26.97 seconds -INFO 07-23 11:41:51 [config.py:214] Setting max_seq_len_to_capture to 132096 to ensure that CUDA graph capture covers sequences of length up to max_model_len. -SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=True, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None, extra_args=None) -Warming up... -[triton-dejavu] ('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32') not in cache, starting to tune... -[triton-dejavu] [2025-07-23 11:41:51] Started benchmarking of 2625 configurations... (use_bo: False, run: 0) -[triton-dejavu] First execution including JIT compilation took 0.003792285919189453s. -[triton-dejavu] First execution including JIT compilation took 0.0024290084838867188s. -[triton-dejavu] First execution including JIT compilation took 0.002718687057495117s. -[triton-dejavu] First execution including JIT compilation took 0.0024924278259277344s. 
[... several hundred deleted log lines elided: repeated "[triton-dejavu] First execution including JIT compilation took ...s." entries, one per autotuner configuration, with per-configuration first-run times ranging from roughly 0.002 s to about 1.0 s ...]
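For context on what these numbers mean: each entry records the wall-clock time of a configuration's very first kernel launch, which includes Triton's JIT compilation. The sketch below is illustrative only, not triton-dejavu's actual implementation; it assumes a CUDA device and a hypothetical zero-argument kernel_launch callable.

import time
import torch

def time_first_call(kernel_launch):
    """kernel_launch: hypothetical zero-arg callable that launches one Triton kernel config."""
    torch.cuda.synchronize()            # drain any pending GPU work first
    start = time.perf_counter()
    kernel_launch()                     # first launch triggers JIT compilation
    torch.cuda.synchronize()            # wait until the kernel actually finishes
    return time.perf_counter() - start  # seconds, comparable to the log entries above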
-[triton-dejavu] First execution including JIT compilation took 0.3455088138580322s. -[triton-dejavu] First execution including JIT compilation took 0.8280949592590332s. -[triton-dejavu] First execution including JIT compilation took 0.5218696594238281s. -[triton-dejavu] First execution including JIT compilation took 0.36759161949157715s. -[triton-dejavu] First execution including JIT compilation took 0.601407527923584s. -[triton-dejavu] First execution including JIT compilation took 0.36752843856811523s. -[triton-dejavu] First execution including JIT compilation took 0.273007869720459s. -[triton-dejavu] First execution including JIT compilation took 0.8815312385559082s. -[triton-dejavu] First execution including JIT compilation took 0.5408468246459961s. -[triton-dejavu] First execution including JIT compilation took 0.4321472644805908s. -[triton-dejavu] First execution including JIT compilation took 1.340597152709961s. -[triton-dejavu] First execution including JIT compilation took 0.6468391418457031s. -[triton-dejavu] First execution including JIT compilation took 0.4674386978149414s. -[triton-dejavu] First execution including JIT compilation took 1.4745817184448242s. -[triton-dejavu] First execution including JIT compilation took 0.7319414615631104s. -[triton-dejavu] First execution including JIT compilation took 0.4820535182952881s. -[triton-dejavu] First execution including JIT compilation took 1.6843111515045166s. -[triton-dejavu] First execution including JIT compilation took 0.7272007465362549s. -[triton-dejavu] First execution including JIT compilation took 0.5684032440185547s. -[triton-dejavu] First execution including JIT compilation took 1.6687507629394531s. -[triton-dejavu] First execution including JIT compilation took 0.7634897232055664s. -[triton-dejavu] First execution including JIT compilation took 0.5958552360534668s. -bench_cudagraph failed with out of resource: shared memory, Required: 308224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 308224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 308224, Hardware limit: 232448. 
Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 308224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 308224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 308224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.21318602561950684s. -[triton-dejavu] First execution including JIT compilation took 0.21699094772338867s. -[triton-dejavu] First execution including JIT compilation took 0.18067646026611328s. -[triton-dejavu] First execution including JIT compilation took 0.2883434295654297s. -[triton-dejavu] First execution including JIT compilation took 0.20682692527770996s. -[triton-dejavu] First execution including JIT compilation took 0.19046735763549805s. 
-[triton-dejavu] First execution including JIT compilation took 0.2932305335998535s. -[triton-dejavu] First execution including JIT compilation took 0.21595001220703125s. -[triton-dejavu] First execution including JIT compilation took 0.18168210983276367s. -[triton-dejavu] First execution including JIT compilation took 0.32982873916625977s. -[triton-dejavu] First execution including JIT compilation took 0.22777533531188965s. -[triton-dejavu] First execution including JIT compilation took 0.2037661075592041s. -[triton-dejavu] First execution including JIT compilation took 0.31747984886169434s. -[triton-dejavu] First execution including JIT compilation took 0.28336167335510254s. -[triton-dejavu] First execution including JIT compilation took 0.19774699211120605s. -[triton-dejavu] First execution including JIT compilation took 0.3306758403778076s. -[triton-dejavu] First execution including JIT compilation took 0.2724292278289795s. -[triton-dejavu] First execution including JIT compilation took 0.22767090797424316s. -[triton-dejavu] First execution including JIT compilation took 0.3717081546783447s. -[triton-dejavu] First execution including JIT compilation took 0.2847135066986084s. -[triton-dejavu] First execution including JIT compilation took 0.2544288635253906s. -[triton-dejavu] First execution including JIT compilation took 0.2563972473144531s. -[triton-dejavu] First execution including JIT compilation took 0.21262860298156738s. -[triton-dejavu] First execution including JIT compilation took 0.2203054428100586s. -[triton-dejavu] First execution including JIT compilation took 0.3555338382720947s. -[triton-dejavu] First execution including JIT compilation took 0.25258374214172363s. -[triton-dejavu] First execution including JIT compilation took 0.22145795822143555s. -[triton-dejavu] First execution including JIT compilation took 0.39704275131225586s. -[triton-dejavu] First execution including JIT compilation took 0.26523470878601074s. -[triton-dejavu] First execution including JIT compilation took 0.21595096588134766s. -[triton-dejavu] First execution including JIT compilation took 0.4347100257873535s. -[triton-dejavu] First execution including JIT compilation took 0.29169178009033203s. -[triton-dejavu] First execution including JIT compilation took 0.21956300735473633s. -[triton-dejavu] First execution including JIT compilation took 0.4330458641052246s. -[triton-dejavu] First execution including JIT compilation took 0.31913185119628906s. -[triton-dejavu] First execution including JIT compilation took 0.2509474754333496s. -[triton-dejavu] First execution including JIT compilation took 0.48702025413513184s. -[triton-dejavu] First execution including JIT compilation took 0.32025718688964844s. -[triton-dejavu] First execution including JIT compilation took 0.2625458240509033s. -[triton-dejavu] First execution including JIT compilation took 0.551466703414917s. -[triton-dejavu] First execution including JIT compilation took 0.37408924102783203s. -[triton-dejavu] First execution including JIT compilation took 0.27136731147766113s. -[triton-dejavu] First execution including JIT compilation took 0.33800768852233887s. -[triton-dejavu] First execution including JIT compilation took 0.26560330390930176s. -[triton-dejavu] First execution including JIT compilation took 0.20183563232421875s. -[triton-dejavu] First execution including JIT compilation took 0.40157294273376465s. -[triton-dejavu] First execution including JIT compilation took 0.3323667049407959s. 
-[triton-dejavu] First execution including JIT compilation took 0.2476518154144287s. -[triton-dejavu] First execution including JIT compilation took 0.5237414836883545s. -[triton-dejavu] First execution including JIT compilation took 0.38099026679992676s. -[triton-dejavu] First execution including JIT compilation took 0.25824856758117676s. -[triton-dejavu] First execution including JIT compilation took 0.5798733234405518s. -[triton-dejavu] First execution including JIT compilation took 0.4060328006744385s. -[triton-dejavu] First execution including JIT compilation took 0.299180269241333s. -[triton-dejavu] First execution including JIT compilation took 0.5705587863922119s. -[triton-dejavu] First execution including JIT compilation took 0.43184709548950195s. -[triton-dejavu] First execution including JIT compilation took 0.29991817474365234s. -[triton-dejavu] First execution including JIT compilation took 0.5768892765045166s. -[triton-dejavu] First execution including JIT compilation took 0.5104458332061768s. -[triton-dejavu] First execution including JIT compilation took 0.36955881118774414s. -[triton-dejavu] First execution including JIT compilation took 0.6489105224609375s. -[triton-dejavu] First execution including JIT compilation took 0.5593419075012207s. -[triton-dejavu] First execution including JIT compilation took 0.3752884864807129s. -[triton-dejavu] First execution including JIT compilation took 0.6494286060333252s. -[triton-dejavu] First execution including JIT compilation took 0.35906028747558594s. -[triton-dejavu] First execution including JIT compilation took 0.2642378807067871s. -[triton-dejavu] First execution including JIT compilation took 0.7536261081695557s. -[triton-dejavu] First execution including JIT compilation took 0.4381115436553955s. -[triton-dejavu] First execution including JIT compilation took 0.3165860176086426s. -[triton-dejavu] First execution including JIT compilation took 1.265178918838501s. -[triton-dejavu] First execution including JIT compilation took 0.5233526229858398s. -[triton-dejavu] First execution including JIT compilation took 0.3574998378753662s. -[triton-dejavu] First execution including JIT compilation took 1.300689697265625s. -[triton-dejavu] First execution including JIT compilation took 0.6212594509124756s. -[triton-dejavu] First execution including JIT compilation took 0.4114842414855957s. -[triton-dejavu] First execution including JIT compilation took 1.3530914783477783s. -[triton-dejavu] First execution including JIT compilation took 0.6589765548706055s. -[triton-dejavu] First execution including JIT compilation took 0.4349792003631592s. -[triton-dejavu] First execution including JIT compilation took 1.412661075592041s. -[triton-dejavu] First execution including JIT compilation took 0.7154123783111572s. -[triton-dejavu] First execution including JIT compilation took 0.5011796951293945s. -bench_cudagraph failed with out of resource: shared memory, Required: 268800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 268800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 268800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 268800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 301568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.3227477073669434s. -[triton-dejavu] First execution including JIT compilation took 0.6530427932739258s. -[triton-dejavu] First execution including JIT compilation took 0.35028672218322754s. -[triton-dejavu] First execution including JIT compilation took 2.6119463443756104s. -[triton-dejavu] First execution including JIT compilation took 0.884284257888794s. -[triton-dejavu] First execution including JIT compilation took 0.5999755859375s. -[triton-dejavu] First execution including JIT compilation took 6.0120015144348145s. -[triton-dejavu] First execution including JIT compilation took 1.4350576400756836s. -[triton-dejavu] First execution including JIT compilation took 0.6809098720550537s. -[triton-dejavu] First execution including JIT compilation took 6.039306402206421s. -[triton-dejavu] First execution including JIT compilation took 1.520536184310913s. -[triton-dejavu] First execution including JIT compilation took 0.7370305061340332s. -bench_cudagraph failed with out of resource: shared memory, Required: 307200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 307200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 307200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 307200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 307200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 307200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 384000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 384000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 384000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 384000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 384000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 384000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 537600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 537600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 537600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 537600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 537600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 537600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.29415297508239746s. -[triton-dejavu] First execution including JIT compilation took 0.2148146629333496s. -[triton-dejavu] First execution including JIT compilation took 0.20547103881835938s. -[triton-dejavu] First execution including JIT compilation took 0.4116225242614746s. -[triton-dejavu] First execution including JIT compilation took 0.27173876762390137s. -[triton-dejavu] First execution including JIT compilation took 0.2047574520111084s. -[triton-dejavu] First execution including JIT compilation took 0.4177091121673584s. -[triton-dejavu] First execution including JIT compilation took 0.2812669277191162s. -[triton-dejavu] First execution including JIT compilation took 0.25458741188049316s. -[triton-dejavu] First execution including JIT compilation took 0.4544975757598877s. -[triton-dejavu] First execution including JIT compilation took 0.30648016929626465s. -[triton-dejavu] First execution including JIT compilation took 0.2235255241394043s. -[triton-dejavu] First execution including JIT compilation took 0.47878551483154297s. -[triton-dejavu] First execution including JIT compilation took 0.3247034549713135s. -[triton-dejavu] First execution including JIT compilation took 0.2551548480987549s. -[triton-dejavu] First execution including JIT compilation took 0.6796090602874756s. -[triton-dejavu] First execution including JIT compilation took 0.3536677360534668s. -[triton-dejavu] First execution including JIT compilation took 0.2720470428466797s. -[triton-dejavu] First execution including JIT compilation took 0.8124823570251465s. -[triton-dejavu] First execution including JIT compilation took 0.4978444576263428s. -[triton-dejavu] First execution including JIT compilation took 0.35080695152282715s. -[triton-dejavu] First execution including JIT compilation took 0.5138082504272461s. -[triton-dejavu] First execution including JIT compilation took 0.3385753631591797s. -[triton-dejavu] First execution including JIT compilation took 0.2594444751739502s. -[triton-dejavu] First execution including JIT compilation took 0.6842827796936035s. 
-[triton-dejavu] First execution including JIT compilation took 0.4294295310974121s. -[triton-dejavu] First execution including JIT compilation took 0.3218364715576172s. -[triton-dejavu] First execution including JIT compilation took 0.8480286598205566s. -[triton-dejavu] First execution including JIT compilation took 0.472670316696167s. -[triton-dejavu] First execution including JIT compilation took 0.36148762702941895s. -[triton-dejavu] First execution including JIT compilation took 0.9478855133056641s. -[triton-dejavu] First execution including JIT compilation took 0.5432147979736328s. -[triton-dejavu] First execution including JIT compilation took 0.38411760330200195s. -[triton-dejavu] First execution including JIT compilation took 1.0501837730407715s. -[triton-dejavu] First execution including JIT compilation took 0.5907988548278809s. -[triton-dejavu] First execution including JIT compilation took 0.39850473403930664s. -[triton-dejavu] First execution including JIT compilation took 1.1722888946533203s. -[triton-dejavu] First execution including JIT compilation took 0.6436972618103027s. -[triton-dejavu] First execution including JIT compilation took 0.42680954933166504s. -[triton-dejavu] First execution including JIT compilation took 1.3340017795562744s. -[triton-dejavu] First execution including JIT compilation took 0.5718722343444824s. -[triton-dejavu] First execution including JIT compilation took 0.38933897018432617s. -[triton-dejavu] First execution including JIT compilation took 0.6949644088745117s. -[triton-dejavu] First execution including JIT compilation took 0.3732309341430664s. -[triton-dejavu] First execution including JIT compilation took 0.26645493507385254s. -[triton-dejavu] First execution including JIT compilation took 0.6677834987640381s. -[triton-dejavu] First execution including JIT compilation took 0.5330057144165039s. -[triton-dejavu] First execution including JIT compilation took 0.3234426975250244s. -[triton-dejavu] First execution including JIT compilation took 1.226240634918213s. -[triton-dejavu] First execution including JIT compilation took 0.7037711143493652s. -[triton-dejavu] First execution including JIT compilation took 0.35811614990234375s. -[triton-dejavu] First execution including JIT compilation took 1.223371982574463s. -[triton-dejavu] First execution including JIT compilation took 0.7030131816864014s. -[triton-dejavu] First execution including JIT compilation took 0.44534802436828613s. -[triton-dejavu] First execution including JIT compilation took 1.3601360321044922s. -[triton-dejavu] First execution including JIT compilation took 0.8273930549621582s. -[triton-dejavu] First execution including JIT compilation took 0.4698348045349121s. -[triton-dejavu] First execution including JIT compilation took 1.3899588584899902s. -[triton-dejavu] First execution including JIT compilation took 0.9071271419525146s. -[triton-dejavu] First execution including JIT compilation took 0.47567152976989746s. -bench_cudagraph failed with out of resource: shared memory, Required: 249088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 281856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 281856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 281856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-[triton-dejavu] First execution including JIT compilation took 1.3848598003387451s.
-[triton-dejavu] First execution including JIT compilation took 0.5705807209014893s.
-[triton-dejavu] First execution including JIT compilation took 0.3730134963989258s.
-[triton-dejavu] First execution including JIT compilation took 1.7507147789001465s.
-[triton-dejavu] First execution including JIT compilation took 0.739149808883667s.
-[triton-dejavu] First execution including JIT compilation took 0.5719029903411865s.
-[triton-dejavu] First execution including JIT compilation took 5.391368865966797s.
-[triton-dejavu] First execution including JIT compilation took 1.2137444019317627s.
-[triton-dejavu] First execution including JIT compilation took 0.6304950714111328s.
-[triton-dejavu] First execution including JIT compilation took 5.735509157180786s.
-[triton-dejavu] First execution including JIT compilation took 1.278113603591919s.
-bench_cudagraph failed with out of resource: shared memory, Required: 279040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 350208, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 355840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 355840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 421376, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 498176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 498176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 563712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-[triton-dejavu] First execution including JIT compilation took 4.086903095245361s.
-[triton-dejavu] First execution including JIT compilation took 1.3743896484375s.
-[triton-dejavu] First execution including JIT compilation took 0.5919761657714844s.
-[triton-dejavu] First execution including JIT compilation took 5.130200147628784s.
-[triton-dejavu] First execution including JIT compilation took 1.6156775951385498s.
-[triton-dejavu] First execution including JIT compilation took 0.8351900577545166s.
-bench_cudagraph failed with out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 427008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 427008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 427008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 569344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 569344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 569344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 711680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 711680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 711680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 996352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 996352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 996352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-[triton-dejavu] First execution including JIT compilation took 0.17352676391601562s.
-[triton-dejavu] First execution including JIT compilation took 0.16042280197143555s.
-[triton-dejavu] First execution including JIT compilation took 0.15305304527282715s.
-[triton-dejavu] First execution including JIT compilation took 0.17835307121276855s.
-[triton-dejavu] First execution including JIT compilation took 0.18025875091552734s.
-[triton-dejavu] First execution including JIT compilation took 0.1706397533416748s.
-[triton-dejavu] First execution including JIT compilation took 0.18474626541137695s.
-[triton-dejavu] First execution including JIT compilation took 0.1816706657409668s.
-[triton-dejavu] First execution including JIT compilation took 0.17277026176452637s.
-[triton-dejavu] First execution including JIT compilation took 0.19310617446899414s.
-[triton-dejavu] First execution including JIT compilation took 0.19066524505615234s.
-[triton-dejavu] First execution including JIT compilation took 0.18411707878112793s.
-[triton-dejavu] First execution including JIT compilation took 0.20400023460388184s.
-[triton-dejavu] First execution including JIT compilation took 0.21816563606262207s.
-[triton-dejavu] First execution including JIT compilation took 0.19965767860412598s.
-[triton-dejavu] First execution including JIT compilation took 0.23587703704833984s.
-[triton-dejavu] First execution including JIT compilation took 0.22630786895751953s.
-[triton-dejavu] First execution including JIT compilation took 0.20632338523864746s.
-[triton-dejavu] First execution including JIT compilation took 0.26639699935913086s.
-[triton-dejavu] First execution including JIT compilation took 0.24132418632507324s.
-[triton-dejavu] First execution including JIT compilation took 0.22358036041259766s.
-[triton-dejavu] First execution including JIT compilation took 0.1937398910522461s.
-[triton-dejavu] First execution including JIT compilation took 0.15766239166259766s.
-[triton-dejavu] First execution including JIT compilation took 0.1767878532409668s.
-[triton-dejavu] First execution including JIT compilation took 0.19301867485046387s. -[triton-dejavu] First execution including JIT compilation took 0.19379115104675293s. -[triton-dejavu] First execution including JIT compilation took 0.17388463020324707s. -[triton-dejavu] First execution including JIT compilation took 0.2268962860107422s. -[triton-dejavu] First execution including JIT compilation took 0.2022707462310791s. -[triton-dejavu] First execution including JIT compilation took 0.19673395156860352s. -[triton-dejavu] First execution including JIT compilation took 0.22176003456115723s. -[triton-dejavu] First execution including JIT compilation took 0.20571017265319824s. -[triton-dejavu] First execution including JIT compilation took 0.21979308128356934s. -[triton-dejavu] First execution including JIT compilation took 0.23570489883422852s. -[triton-dejavu] First execution including JIT compilation took 0.22661423683166504s. -[triton-dejavu] First execution including JIT compilation took 0.22267603874206543s. -[triton-dejavu] First execution including JIT compilation took 0.24321699142456055s. -[triton-dejavu] First execution including JIT compilation took 0.23399901390075684s. -[triton-dejavu] First execution including JIT compilation took 0.22303104400634766s. -[triton-dejavu] First execution including JIT compilation took 0.28810644149780273s. -[triton-dejavu] First execution including JIT compilation took 0.23264646530151367s. -[triton-dejavu] First execution including JIT compilation took 0.2400953769683838s. -[triton-dejavu] First execution including JIT compilation took 0.21341371536254883s. -[triton-dejavu] First execution including JIT compilation took 0.19530224800109863s. -[triton-dejavu] First execution including JIT compilation took 0.1753242015838623s. -[triton-dejavu] First execution including JIT compilation took 0.24271106719970703s. -[triton-dejavu] First execution including JIT compilation took 0.22337555885314941s. -[triton-dejavu] First execution including JIT compilation took 0.20344924926757812s. -[triton-dejavu] First execution including JIT compilation took 0.26564812660217285s. -[triton-dejavu] First execution including JIT compilation took 0.22059965133666992s. -[triton-dejavu] First execution including JIT compilation took 0.19876718521118164s. -[triton-dejavu] First execution including JIT compilation took 0.3027980327606201s. -[triton-dejavu] First execution including JIT compilation took 0.2440967559814453s. -[triton-dejavu] First execution including JIT compilation took 0.21737980842590332s. -[triton-dejavu] First execution including JIT compilation took 0.3214104175567627s. -[triton-dejavu] First execution including JIT compilation took 0.23887038230895996s. -[triton-dejavu] First execution including JIT compilation took 0.22879958152770996s. -[triton-dejavu] First execution including JIT compilation took 0.31365513801574707s. -[triton-dejavu] First execution including JIT compilation took 0.2629280090332031s. -[triton-dejavu] First execution including JIT compilation took 0.22771596908569336s. -[triton-dejavu] First execution including JIT compilation took 0.40690040588378906s. -[triton-dejavu] First execution including JIT compilation took 0.32520389556884766s. -[triton-dejavu] First execution including JIT compilation took 0.2640228271484375s. -[triton-dejavu] First execution including JIT compilation took 0.2796199321746826s. -[triton-dejavu] First execution including JIT compilation took 0.21073007583618164s. 
-[triton-dejavu] First execution including JIT compilation took 0.1934361457824707s. -[triton-dejavu] First execution including JIT compilation took 0.34839892387390137s. -[triton-dejavu] First execution including JIT compilation took 0.3115088939666748s. -[triton-dejavu] First execution including JIT compilation took 0.20244383811950684s. -[triton-dejavu] First execution including JIT compilation took 0.38748598098754883s. -[triton-dejavu] First execution including JIT compilation took 0.3139615058898926s. -[triton-dejavu] First execution including JIT compilation took 0.22042202949523926s. -[triton-dejavu] First execution including JIT compilation took 0.4271514415740967s. -[triton-dejavu] First execution including JIT compilation took 0.34604549407958984s. -[triton-dejavu] First execution including JIT compilation took 0.22012782096862793s. -[triton-dejavu] First execution including JIT compilation took 0.5421981811523438s. -[triton-dejavu] First execution including JIT compilation took 0.3638300895690918s. -[triton-dejavu] First execution including JIT compilation took 0.23948025703430176s. -[triton-dejavu] First execution including JIT compilation took 0.4606790542602539s. -[triton-dejavu] First execution including JIT compilation took 0.3932468891143799s. -[triton-dejavu] First execution including JIT compilation took 0.26195645332336426s. -[triton-dejavu] First execution including JIT compilation took 0.5043284893035889s. -[triton-dejavu] First execution including JIT compilation took 0.4588782787322998s. -[triton-dejavu] First execution including JIT compilation took 0.2814829349517822s. -[triton-dejavu] First execution including JIT compilation took 0.39888525009155273s. -[triton-dejavu] First execution including JIT compilation took 0.2917821407318115s. -[triton-dejavu] First execution including JIT compilation took 0.2808530330657959s. -[triton-dejavu] First execution including JIT compilation took 0.6014502048492432s. -[triton-dejavu] First execution including JIT compilation took 0.5006814002990723s. -[triton-dejavu] First execution including JIT compilation took 0.4271118640899658s. -[triton-dejavu] First execution including JIT compilation took 0.7629735469818115s. -[triton-dejavu] First execution including JIT compilation took 0.49445056915283203s. -[triton-dejavu] First execution including JIT compilation took 0.4509761333465576s. -[triton-dejavu] First execution including JIT compilation took 0.7416894435882568s. -[triton-dejavu] First execution including JIT compilation took 0.5215311050415039s. -[triton-dejavu] First execution including JIT compilation took 0.4689524173736572s. -[triton-dejavu] First execution including JIT compilation took 0.7672626972198486s. -[triton-dejavu] First execution including JIT compilation took 0.6135916709899902s. -[triton-dejavu] First execution including JIT compilation took 0.49275946617126465s. -[triton-dejavu] First execution including JIT compilation took 0.9401953220367432s. -[triton-dejavu] First execution including JIT compilation took 0.5827491283416748s. -[triton-dejavu] First execution including JIT compilation took 0.4996645450592041s. -[triton-dejavu] First execution including JIT compilation took 0.9401323795318604s. -[triton-dejavu] First execution including JIT compilation took 0.668349027633667s. -[triton-dejavu] First execution including JIT compilation took 0.5485968589782715s. -[triton-dejavu] First execution including JIT compilation took 0.18768930435180664s. 
-[triton-dejavu] First execution including JIT compilation took 0.16661763191223145s. -[triton-dejavu] First execution including JIT compilation took 0.17052841186523438s. -[triton-dejavu] First execution including JIT compilation took 0.2100682258605957s. -[triton-dejavu] First execution including JIT compilation took 0.20937323570251465s. -[triton-dejavu] First execution including JIT compilation took 0.19020938873291016s. -[triton-dejavu] First execution including JIT compilation took 0.20560169219970703s. -[triton-dejavu] First execution including JIT compilation took 0.19290709495544434s. -[triton-dejavu] First execution including JIT compilation took 0.19777560234069824s. -[triton-dejavu] First execution including JIT compilation took 0.21995210647583008s. -[triton-dejavu] First execution including JIT compilation took 0.21872901916503906s. -[triton-dejavu] First execution including JIT compilation took 0.2034306526184082s. -[triton-dejavu] First execution including JIT compilation took 0.24239134788513184s. -[triton-dejavu] First execution including JIT compilation took 0.26946043968200684s. -[triton-dejavu] First execution including JIT compilation took 0.2544829845428467s. -[triton-dejavu] First execution including JIT compilation took 0.2922830581665039s. -[triton-dejavu] First execution including JIT compilation took 0.27474284172058105s. -[triton-dejavu] First execution including JIT compilation took 0.2743556499481201s. -[triton-dejavu] First execution including JIT compilation took 0.31418609619140625s. -[triton-dejavu] First execution including JIT compilation took 0.30026888847351074s. -[triton-dejavu] First execution including JIT compilation took 0.28441858291625977s. -[triton-dejavu] First execution including JIT compilation took 0.23394203186035156s. -[triton-dejavu] First execution including JIT compilation took 0.20772600173950195s. -[triton-dejavu] First execution including JIT compilation took 0.18996429443359375s. -[triton-dejavu] First execution including JIT compilation took 0.27438807487487793s. -[triton-dejavu] First execution including JIT compilation took 0.23485589027404785s. -[triton-dejavu] First execution including JIT compilation took 0.2199420928955078s. -[triton-dejavu] First execution including JIT compilation took 0.29147887229919434s. -[triton-dejavu] First execution including JIT compilation took 0.2452247142791748s. -[triton-dejavu] First execution including JIT compilation took 0.2305736541748047s. -[triton-dejavu] First execution including JIT compilation took 0.30902743339538574s. -[triton-dejavu] First execution including JIT compilation took 0.2559957504272461s. -[triton-dejavu] First execution including JIT compilation took 0.2465808391571045s. -[triton-dejavu] First execution including JIT compilation took 0.3298933506011963s. -[triton-dejavu] First execution including JIT compilation took 0.27321410179138184s. -[triton-dejavu] First execution including JIT compilation took 0.26524877548217773s. -[triton-dejavu] First execution including JIT compilation took 0.34816527366638184s. -[triton-dejavu] First execution including JIT compilation took 0.28248119354248047s. -[triton-dejavu] First execution including JIT compilation took 0.267411470413208s. -[triton-dejavu] First execution including JIT compilation took 0.4036557674407959s. -[triton-dejavu] First execution including JIT compilation took 0.30405187606811523s. -[triton-dejavu] First execution including JIT compilation took 0.3068065643310547s. 
-[triton-dejavu] First execution including JIT compilation took 0.2875032424926758s. -[triton-dejavu] First execution including JIT compilation took 0.23735547065734863s. -[triton-dejavu] First execution including JIT compilation took 0.23032617568969727s. -[triton-dejavu] First execution including JIT compilation took 0.3493824005126953s. -[triton-dejavu] First execution including JIT compilation took 0.27472662925720215s. -[triton-dejavu] First execution including JIT compilation took 0.2401866912841797s. -[triton-dejavu] First execution including JIT compilation took 0.39062976837158203s. -[triton-dejavu] First execution including JIT compilation took 0.29250192642211914s. -[triton-dejavu] First execution including JIT compilation took 0.2570502758026123s. -[triton-dejavu] First execution including JIT compilation took 0.3952975273132324s. -[triton-dejavu] First execution including JIT compilation took 0.31146764755249023s. -[triton-dejavu] First execution including JIT compilation took 0.26107025146484375s. -[triton-dejavu] First execution including JIT compilation took 0.43129730224609375s. -[triton-dejavu] First execution including JIT compilation took 0.3286442756652832s. -[triton-dejavu] First execution including JIT compilation took 0.2835230827331543s. -[triton-dejavu] First execution including JIT compilation took 0.43753743171691895s. -[triton-dejavu] First execution including JIT compilation took 0.34508848190307617s. -[triton-dejavu] First execution including JIT compilation took 0.2861642837524414s. -[triton-dejavu] First execution including JIT compilation took 0.48541975021362305s. -[triton-dejavu] First execution including JIT compilation took 0.3953580856323242s. -[triton-dejavu] First execution including JIT compilation took 0.31298136711120605s. -[triton-dejavu] First execution including JIT compilation took 0.3618011474609375s. -[triton-dejavu] First execution including JIT compilation took 0.29604053497314453s. -[triton-dejavu] First execution including JIT compilation took 0.2307584285736084s. -[triton-dejavu] First execution including JIT compilation took 0.4865717887878418s. -[triton-dejavu] First execution including JIT compilation took 0.40287113189697266s. -[triton-dejavu] First execution including JIT compilation took 0.27056026458740234s. -[triton-dejavu] First execution including JIT compilation took 0.5447485446929932s. -[triton-dejavu] First execution including JIT compilation took 0.430034875869751s. -[triton-dejavu] First execution including JIT compilation took 0.30031275749206543s. -[triton-dejavu] First execution including JIT compilation took 0.6063799858093262s. -[triton-dejavu] First execution including JIT compilation took 0.4732537269592285s. -[triton-dejavu] First execution including JIT compilation took 0.3194100856781006s. -[triton-dejavu] First execution including JIT compilation took 0.6529583930969238s. -[triton-dejavu] First execution including JIT compilation took 0.4868447780609131s. -[triton-dejavu] First execution including JIT compilation took 0.35962390899658203s. -[triton-dejavu] First execution including JIT compilation took 0.6952221393585205s. -[triton-dejavu] First execution including JIT compilation took 0.5078432559967041s. -[triton-dejavu] First execution including JIT compilation took 0.3716623783111572s. -[triton-dejavu] First execution including JIT compilation took 0.7688384056091309s. -[triton-dejavu] First execution including JIT compilation took 0.5738773345947266s. 
-[triton-dejavu] First execution including JIT compilation took 0.40444135665893555s. -[triton-dejavu] First execution including JIT compilation took 0.5380966663360596s. -[triton-dejavu] First execution including JIT compilation took 0.4179868698120117s. -[triton-dejavu] First execution including JIT compilation took 0.2959005832672119s. -[triton-dejavu] First execution including JIT compilation took 0.8164780139923096s. -[triton-dejavu] First execution including JIT compilation took 0.6937565803527832s. -[triton-dejavu] First execution including JIT compilation took 0.49874210357666016s. -[triton-dejavu] First execution including JIT compilation took 1.0514369010925293s. -[triton-dejavu] First execution including JIT compilation took 0.7419230937957764s. -[triton-dejavu] First execution including JIT compilation took 0.5654633045196533s. -[triton-dejavu] First execution including JIT compilation took 1.0287201404571533s. -[triton-dejavu] First execution including JIT compilation took 0.7904648780822754s. -[triton-dejavu] First execution including JIT compilation took 0.5895709991455078s. -[triton-dejavu] First execution including JIT compilation took 1.1943697929382324s. -[triton-dejavu] First execution including JIT compilation took 0.8456614017486572s. -[triton-dejavu] First execution including JIT compilation took 0.6484768390655518s. -[triton-dejavu] First execution including JIT compilation took 1.2749860286712646s. -[triton-dejavu] First execution including JIT compilation took 0.906667947769165s. -[triton-dejavu] First execution including JIT compilation took 0.6650793552398682s. -bench_cudagraph failed with out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.2316875457763672s. -[triton-dejavu] First execution including JIT compilation took 0.17262887954711914s. -[triton-dejavu] First execution including JIT compilation took 0.16709303855895996s. -[triton-dejavu] First execution including JIT compilation took 0.2316112518310547s. -[triton-dejavu] First execution including JIT compilation took 0.19475674629211426s. -[triton-dejavu] First execution including JIT compilation took 0.2070786952972412s. 
-[triton-dejavu] First execution including JIT compilation took 0.26039981842041016s. -[triton-dejavu] First execution including JIT compilation took 0.20882773399353027s. -[triton-dejavu] First execution including JIT compilation took 0.20374321937561035s. -[triton-dejavu] First execution including JIT compilation took 0.27775073051452637s. -[triton-dejavu] First execution including JIT compilation took 0.2467350959777832s. -[triton-dejavu] First execution including JIT compilation took 0.21544861793518066s. -[triton-dejavu] First execution including JIT compilation took 0.32980918884277344s. -[triton-dejavu] First execution including JIT compilation took 0.21561956405639648s. -[triton-dejavu] First execution including JIT compilation took 0.2177584171295166s. -[triton-dejavu] First execution including JIT compilation took 0.32271885871887207s. -[triton-dejavu] First execution including JIT compilation took 0.24530315399169922s. -[triton-dejavu] First execution including JIT compilation took 0.2153329849243164s. -[triton-dejavu] First execution including JIT compilation took 0.3502006530761719s. -[triton-dejavu] First execution including JIT compilation took 0.28928208351135254s. -[triton-dejavu] First execution including JIT compilation took 0.22752094268798828s. -[triton-dejavu] First execution including JIT compilation took 0.22658753395080566s. -[triton-dejavu] First execution including JIT compilation took 0.19278335571289062s. -[triton-dejavu] First execution including JIT compilation took 0.18082761764526367s. -[triton-dejavu] First execution including JIT compilation took 0.2848987579345703s. -[triton-dejavu] First execution including JIT compilation took 0.23020219802856445s. -[triton-dejavu] First execution including JIT compilation took 0.18162894248962402s. -[triton-dejavu] First execution including JIT compilation took 0.37184619903564453s. -[triton-dejavu] First execution including JIT compilation took 0.29797792434692383s. -[triton-dejavu] First execution including JIT compilation took 0.28612470626831055s. -[triton-dejavu] First execution including JIT compilation took 0.4051649570465088s. -[triton-dejavu] First execution including JIT compilation took 0.32303476333618164s. -[triton-dejavu] First execution including JIT compilation took 0.2697916030883789s. -[triton-dejavu] First execution including JIT compilation took 0.4405784606933594s. -[triton-dejavu] First execution including JIT compilation took 0.34795689582824707s. -[triton-dejavu] First execution including JIT compilation took 0.2898232936859131s. -[triton-dejavu] First execution including JIT compilation took 0.4761343002319336s. -[triton-dejavu] First execution including JIT compilation took 0.36168718338012695s. -[triton-dejavu] First execution including JIT compilation took 0.28768467903137207s. -[triton-dejavu] First execution including JIT compilation took 0.5420176982879639s. -[triton-dejavu] First execution including JIT compilation took 0.38568615913391113s. -[triton-dejavu] First execution including JIT compilation took 0.3170638084411621s. -[triton-dejavu] First execution including JIT compilation took 0.35248708724975586s. -[triton-dejavu] First execution including JIT compilation took 0.273029088973999s. -[triton-dejavu] First execution including JIT compilation took 0.23226165771484375s. -[triton-dejavu] First execution including JIT compilation took 0.41886162757873535s. -[triton-dejavu] First execution including JIT compilation took 0.3393113613128662s. 
-[triton-dejavu] First execution including JIT compilation took 0.26583361625671387s. -[triton-dejavu] First execution including JIT compilation took 0.46086597442626953s. -[triton-dejavu] First execution including JIT compilation took 0.3681511878967285s. -[triton-dejavu] First execution including JIT compilation took 0.28913354873657227s. -[triton-dejavu] First execution including JIT compilation took 0.49338269233703613s. -[triton-dejavu] First execution including JIT compilation took 0.39551806449890137s. -[triton-dejavu] First execution including JIT compilation took 0.3077273368835449s. -[triton-dejavu] First execution including JIT compilation took 0.5245680809020996s. -[triton-dejavu] First execution including JIT compilation took 0.4535055160522461s. -[triton-dejavu] First execution including JIT compilation took 0.32816529273986816s. -[triton-dejavu] First execution including JIT compilation took 0.563164234161377s. -[triton-dejavu] First execution including JIT compilation took 0.4629950523376465s. -[triton-dejavu] First execution including JIT compilation took 0.34805798530578613s. -[triton-dejavu] First execution including JIT compilation took 0.620722770690918s. -[triton-dejavu] First execution including JIT compilation took 0.5199508666992188s. -[triton-dejavu] First execution including JIT compilation took 0.38794612884521484s. -[triton-dejavu] First execution including JIT compilation took 0.5147454738616943s. -[triton-dejavu] First execution including JIT compilation took 0.3552286624908447s. -[triton-dejavu] First execution including JIT compilation took 0.28772640228271484s. -[triton-dejavu] First execution including JIT compilation took 0.6648948192596436s. -[triton-dejavu] First execution including JIT compilation took 0.47719812393188477s. -[triton-dejavu] First execution including JIT compilation took 0.34389352798461914s. -[triton-dejavu] First execution including JIT compilation took 0.767352819442749s. -[triton-dejavu] First execution including JIT compilation took 0.5330626964569092s. -[triton-dejavu] First execution including JIT compilation took 0.37920188903808594s. -[triton-dejavu] First execution including JIT compilation took 0.7848920822143555s. -[triton-dejavu] First execution including JIT compilation took 0.47530531883239746s. -[triton-dejavu] First execution including JIT compilation took 0.33605313301086426s. -[triton-dejavu] First execution including JIT compilation took 0.7004132270812988s. -[triton-dejavu] First execution including JIT compilation took 0.4657857418060303s. -[triton-dejavu] First execution including JIT compilation took 0.3541529178619385s. -[triton-dejavu] First execution including JIT compilation took 0.7426049709320068s. -[triton-dejavu] First execution including JIT compilation took 0.538907527923584s. -[triton-dejavu] First execution including JIT compilation took 0.3655426502227783s. -[triton-dejavu] First execution including JIT compilation took 0.8675262928009033s. -[triton-dejavu] First execution including JIT compilation took 0.5515866279602051s. -[triton-dejavu] First execution including JIT compilation took 0.4889793395996094s. -[triton-dejavu] First execution including JIT compilation took 0.9458072185516357s. -[triton-dejavu] First execution including JIT compilation took 0.5496277809143066s. -[triton-dejavu] First execution including JIT compilation took 0.3970763683319092s. -[triton-dejavu] First execution including JIT compilation took 1.425358772277832s. 
-[triton-dejavu] First execution including JIT compilation took 0.8153319358825684s. -[triton-dejavu] First execution including JIT compilation took 0.6550781726837158s. -[triton-dejavu] First execution including JIT compilation took 2.0274274349212646s. -[triton-dejavu] First execution including JIT compilation took 0.9665567874908447s. -[triton-dejavu] First execution including JIT compilation took 0.7323272228240967s. -[triton-dejavu] First execution including JIT compilation took 2.0035274028778076s. -[triton-dejavu] First execution including JIT compilation took 0.794562578201294s. -[triton-dejavu] First execution including JIT compilation took 0.5924549102783203s. -[triton-dejavu] First execution including JIT compilation took 1.8792035579681396s. -[triton-dejavu] First execution including JIT compilation took 0.8279719352722168s. -[triton-dejavu] First execution including JIT compilation took 0.5921733379364014s. -bench_cudagraph failed with out of resource: shared memory, Required: 261120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 261120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 261120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-bench_cudagraph failed with out of resource: shared memory, Required: 261120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 365568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 365568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 365568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 365568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.25391125679016113s. -[triton-dejavu] First execution including JIT compilation took 0.20267081260681152s. -[triton-dejavu] First execution including JIT compilation took 0.16582441329956055s. -[triton-dejavu] First execution including JIT compilation took 0.2948935031890869s. -[triton-dejavu] First execution including JIT compilation took 0.22772979736328125s. -[triton-dejavu] First execution including JIT compilation took 0.18806099891662598s. -[triton-dejavu] First execution including JIT compilation took 0.3108959197998047s. -[triton-dejavu] First execution including JIT compilation took 0.344806432723999s. -[triton-dejavu] First execution including JIT compilation took 0.20200371742248535s. -[triton-dejavu] First execution including JIT compilation took 0.3476550579071045s. -[triton-dejavu] First execution including JIT compilation took 0.2549312114715576s. -[triton-dejavu] First execution including JIT compilation took 0.22494268417358398s. -[triton-dejavu] First execution including JIT compilation took 0.37220120429992676s. -[triton-dejavu] First execution including JIT compilation took 0.2809004783630371s. -[triton-dejavu] First execution including JIT compilation took 0.22059869766235352s. -[triton-dejavu] First execution including JIT compilation took 0.388761043548584s. -[triton-dejavu] First execution including JIT compilation took 0.3077573776245117s. -[triton-dejavu] First execution including JIT compilation took 0.28969645500183105s. -[triton-dejavu] First execution including JIT compilation took 0.4982035160064697s. -[triton-dejavu] First execution including JIT compilation took 0.39230942726135254s. -[triton-dejavu] First execution including JIT compilation took 0.2644228935241699s. -[triton-dejavu] First execution including JIT compilation took 0.3611593246459961s. -[triton-dejavu] First execution including JIT compilation took 0.2406003475189209s. -[triton-dejavu] First execution including JIT compilation took 0.20929169654846191s. -[triton-dejavu] First execution including JIT compilation took 0.4092109203338623s. 
-[triton-dejavu] First execution including JIT compilation took 0.2963707447052002s. -[triton-dejavu] First execution including JIT compilation took 0.2378685474395752s. -[triton-dejavu] First execution including JIT compilation took 0.45641469955444336s. -[triton-dejavu] First execution including JIT compilation took 0.32480573654174805s. -[triton-dejavu] First execution including JIT compilation took 0.2426598072052002s. -[triton-dejavu] First execution including JIT compilation took 0.5122194290161133s. -[triton-dejavu] First execution including JIT compilation took 0.3030378818511963s. -[triton-dejavu] First execution including JIT compilation took 0.24106526374816895s. -[triton-dejavu] First execution including JIT compilation took 0.4959719181060791s. -[triton-dejavu] First execution including JIT compilation took 0.4293406009674072s. -[triton-dejavu] First execution including JIT compilation took 0.32636475563049316s. -[triton-dejavu] First execution including JIT compilation took 0.7202773094177246s. -[triton-dejavu] First execution including JIT compilation took 0.4574899673461914s. -[triton-dejavu] First execution including JIT compilation took 0.3512580394744873s. -[triton-dejavu] First execution including JIT compilation took 0.8339159488677979s. -[triton-dejavu] First execution including JIT compilation took 0.5235207080841064s. -[triton-dejavu] First execution including JIT compilation took 0.3777124881744385s. -[triton-dejavu] First execution including JIT compilation took 0.5410404205322266s. -[triton-dejavu] First execution including JIT compilation took 0.37018918991088867s. -[triton-dejavu] First execution including JIT compilation took 0.27248096466064453s. -[triton-dejavu] First execution including JIT compilation took 0.6164650917053223s. -[triton-dejavu] First execution including JIT compilation took 0.4583768844604492s. -[triton-dejavu] First execution including JIT compilation took 0.32603883743286133s. -[triton-dejavu] First execution including JIT compilation took 0.7062644958496094s. -[triton-dejavu] First execution including JIT compilation took 0.515678882598877s. -[triton-dejavu] First execution including JIT compilation took 0.36606812477111816s. -[triton-dejavu] First execution including JIT compilation took 0.7721257209777832s. -[triton-dejavu] First execution including JIT compilation took 0.5739889144897461s. -[triton-dejavu] First execution including JIT compilation took 0.4048495292663574s. -[triton-dejavu] First execution including JIT compilation took 0.8282396793365479s. -[triton-dejavu] First execution including JIT compilation took 0.5025653839111328s. -[triton-dejavu] First execution including JIT compilation took 0.33838868141174316s. -[triton-dejavu] First execution including JIT compilation took 0.7806441783905029s. -[triton-dejavu] First execution including JIT compilation took 0.6090381145477295s. -[triton-dejavu] First execution including JIT compilation took 0.3753626346588135s. -[triton-dejavu] First execution including JIT compilation took 0.7488856315612793s. -[triton-dejavu] First execution including JIT compilation took 0.7003397941589355s. -[triton-dejavu] First execution including JIT compilation took 0.41066956520080566s. -[triton-dejavu] First execution including JIT compilation took 0.7540671825408936s. -[triton-dejavu] First execution including JIT compilation took 0.4108397960662842s. -[triton-dejavu] First execution including JIT compilation took 0.28084588050842285s. 
-[triton-dejavu] First execution including JIT compilation took 0.8834891319274902s. -[triton-dejavu] First execution including JIT compilation took 0.49424219131469727s. -[triton-dejavu] First execution including JIT compilation took 0.39174604415893555s. -[triton-dejavu] First execution including JIT compilation took 1.3143653869628906s. -[triton-dejavu] First execution including JIT compilation took 0.646043062210083s. -[triton-dejavu] First execution including JIT compilation took 0.626563549041748s. -[triton-dejavu] First execution including JIT compilation took 1.4293315410614014s. -[triton-dejavu] First execution including JIT compilation took 0.6236376762390137s. -[triton-dejavu] First execution including JIT compilation took 0.48520755767822266s. -[triton-dejavu] First execution including JIT compilation took 1.6379265785217285s. -[triton-dejavu] First execution including JIT compilation took 0.7366843223571777s. -[triton-dejavu] First execution including JIT compilation took 0.4963555335998535s. -[triton-dejavu] First execution including JIT compilation took 1.574018955230713s. -[triton-dejavu] First execution including JIT compilation took 0.7276029586791992s. -bench_cudagraph failed with out of resource: shared memory, Required: 245248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-bench_cudagraph failed with out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 330240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 330240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.557365894317627s. -[triton-dejavu] First execution including JIT compilation took 0.6995244026184082s. -[triton-dejavu] First execution including JIT compilation took 0.42113590240478516s. -[triton-dejavu] First execution including JIT compilation took 2.4745399951934814s. -[triton-dejavu] First execution including JIT compilation took 0.9797840118408203s. -[triton-dejavu] First execution including JIT compilation took 0.6241872310638428s. -[triton-dejavu] First execution including JIT compilation took 6.13117790222168s. -[triton-dejavu] First execution including JIT compilation took 1.4725189208984375s. -[triton-dejavu] First execution including JIT compilation took 0.681943416595459s. -bench_cudagraph failed with out of resource: shared memory, Required: 254976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-bench_cudagraph failed with out of resource: shared memory, Required: 254976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 254976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 424960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 424960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 424960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.3794093132019043s. -[triton-dejavu] First execution including JIT compilation took 0.28264904022216797s. -[triton-dejavu] First execution including JIT compilation took 0.22452759742736816s. -[triton-dejavu] First execution including JIT compilation took 0.559002161026001s. -[triton-dejavu] First execution including JIT compilation took 0.3224790096282959s. -[triton-dejavu] First execution including JIT compilation took 0.28389406204223633s. 
-[triton-dejavu] First execution including JIT compilation took 0.5182971954345703s. -[triton-dejavu] First execution including JIT compilation took 0.345487117767334s. -[triton-dejavu] First execution including JIT compilation took 0.2435622215270996s. -[triton-dejavu] First execution including JIT compilation took 0.5336036682128906s. -[triton-dejavu] First execution including JIT compilation took 0.3894071578979492s. -[triton-dejavu] First execution including JIT compilation took 0.3008308410644531s. -[triton-dejavu] First execution including JIT compilation took 0.7498984336853027s. -[triton-dejavu] First execution including JIT compilation took 0.41705965995788574s. -[triton-dejavu] First execution including JIT compilation took 0.2856142520904541s. -[triton-dejavu] First execution including JIT compilation took 0.7986507415771484s. -[triton-dejavu] First execution including JIT compilation took 0.506192684173584s. -[triton-dejavu] First execution including JIT compilation took 0.35767054557800293s. -[triton-dejavu] First execution including JIT compilation took 0.9271838665008545s. -[triton-dejavu] First execution including JIT compilation took 0.5614745616912842s. -[triton-dejavu] First execution including JIT compilation took 0.39832496643066406s. -[triton-dejavu] First execution including JIT compilation took 0.6550092697143555s. -[triton-dejavu] First execution including JIT compilation took 0.4102933406829834s. -[triton-dejavu] First execution including JIT compilation took 0.28809452056884766s. -[triton-dejavu] First execution including JIT compilation took 0.8442857265472412s. -[triton-dejavu] First execution including JIT compilation took 0.49399375915527344s. -[triton-dejavu] First execution including JIT compilation took 0.3414480686187744s. -[triton-dejavu] First execution including JIT compilation took 0.9948995113372803s. -[triton-dejavu] First execution including JIT compilation took 0.544846773147583s. -[triton-dejavu] First execution including JIT compilation took 0.36998677253723145s. -[triton-dejavu] First execution including JIT compilation took 1.1347663402557373s. -[triton-dejavu] First execution including JIT compilation took 0.5956213474273682s. -[triton-dejavu] First execution including JIT compilation took 0.41924381256103516s. -[triton-dejavu] First execution including JIT compilation took 1.2498164176940918s. -[triton-dejavu] First execution including JIT compilation took 0.6886944770812988s. -[triton-dejavu] First execution including JIT compilation took 0.45352959632873535s. -[triton-dejavu] First execution including JIT compilation took 1.3488807678222656s. -[triton-dejavu] First execution including JIT compilation took 0.7345826625823975s. -[triton-dejavu] First execution including JIT compilation took 0.4611852169036865s. -[triton-dejavu] First execution including JIT compilation took 1.6846129894256592s. -[triton-dejavu] First execution including JIT compilation took 0.8527877330780029s. -[triton-dejavu] First execution including JIT compilation took 0.519357442855835s. -[triton-dejavu] First execution including JIT compilation took 0.9926292896270752s. -[triton-dejavu] First execution including JIT compilation took 0.5671131610870361s. -[triton-dejavu] First execution including JIT compilation took 0.36908459663391113s. -[triton-dejavu] First execution including JIT compilation took 1.1392111778259277s. -[triton-dejavu] First execution including JIT compilation took 0.7338624000549316s. 
-[triton-dejavu] First execution including JIT compilation took 0.37808799743652344s. -[triton-dejavu] First execution including JIT compilation took 1.32969069480896s. -[triton-dejavu] First execution including JIT compilation took 0.7195644378662109s. -[triton-dejavu] First execution including JIT compilation took 0.43347692489624023s. -[triton-dejavu] First execution including JIT compilation took 1.5576729774475098s. -[triton-dejavu] First execution including JIT compilation took 0.780888557434082s. -[triton-dejavu] First execution including JIT compilation took 0.5686335563659668s. -[triton-dejavu] First execution including JIT compilation took 1.5757191181182861s. -[triton-dejavu] First execution including JIT compilation took 1.1339452266693115s. -[triton-dejavu] First execution including JIT compilation took 0.6171472072601318s. -[triton-dejavu] First execution including JIT compilation took 1.9367270469665527s. -[triton-dejavu] First execution including JIT compilation took 1.2703828811645508s. -[triton-dejavu] First execution including JIT compilation took 0.6219308376312256s. -bench_cudagraph failed with out of resource: shared memory, Required: 263424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 296192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 296192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 296192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 296192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 2.229659080505371s. -[triton-dejavu] First execution including JIT compilation took 0.6827125549316406s. -[triton-dejavu] First execution including JIT compilation took 0.4279639720916748s. -[triton-dejavu] First execution including JIT compilation took 2.214158535003662s. -[triton-dejavu] First execution including JIT compilation took 0.847602367401123s. -[triton-dejavu] First execution including JIT compilation took 0.5684854984283447s. 
-[triton-dejavu] First execution including JIT compilation took 5.671643257141113s. -[triton-dejavu] First execution including JIT compilation took 1.3386998176574707s. -[triton-dejavu] First execution including JIT compilation took 0.7006118297576904s. -[triton-dejavu] First execution including JIT compilation took 6.009850025177002s. -[triton-dejavu] First execution including JIT compilation took 1.425264596939087s. -bench_cudagraph failed with out of resource: shared memory, Required: 291328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 291328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 376320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 376320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 376320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 376320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 441856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 441856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 526848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 526848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 526848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 526848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 592384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 592384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 4.9762208461761475s. -[triton-dejavu] First execution including JIT compilation took 1.5692176818847656s. -[triton-dejavu] First execution including JIT compilation took 0.7641324996948242s. -[triton-dejavu] First execution including JIT compilation took 6.608908176422119s. -[triton-dejavu] First execution including JIT compilation took 2.132209062576294s. -[triton-dejavu] First execution including JIT compilation took 0.9537761211395264s. 
-bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 451584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 451584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 451584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 451584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 451584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 451584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 602112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 602112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 602112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 602112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 602112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 602112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 752640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 752640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 752640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 752640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 752640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 752640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.1910562515258789s. -[triton-dejavu] First execution including JIT compilation took 0.17246103286743164s. -[triton-dejavu] First execution including JIT compilation took 0.17216134071350098s. -[triton-dejavu] First execution including JIT compilation took 0.2674295902252197s. -[triton-dejavu] First execution including JIT compilation took 0.2252979278564453s. -[triton-dejavu] First execution including JIT compilation took 0.19235920906066895s. -[triton-dejavu] First execution including JIT compilation took 0.21264386177062988s. -[triton-dejavu] First execution including JIT compilation took 0.2161550521850586s. -[triton-dejavu] First execution including JIT compilation took 0.1922304630279541s. -[triton-dejavu] First execution including JIT compilation took 0.25547218322753906s. -[triton-dejavu] First execution including JIT compilation took 0.27811694145202637s. -[triton-dejavu] First execution including JIT compilation took 0.23806548118591309s. -[triton-dejavu] First execution including JIT compilation took 0.3011937141418457s. -[triton-dejavu] First execution including JIT compilation took 0.2934706211090088s. -[triton-dejavu] First execution including JIT compilation took 0.25322985649108887s. -[triton-dejavu] First execution including JIT compilation took 0.3128054141998291s. -[triton-dejavu] First execution including JIT compilation took 0.28246212005615234s. -[triton-dejavu] First execution including JIT compilation took 0.2546539306640625s. -[triton-dejavu] First execution including JIT compilation took 0.33473944664001465s. -[triton-dejavu] First execution including JIT compilation took 0.30516982078552246s. -[triton-dejavu] First execution including JIT compilation took 0.27489733695983887s. -[triton-dejavu] First execution including JIT compilation took 0.24262332916259766s. -[triton-dejavu] First execution including JIT compilation took 0.2100353240966797s. -[triton-dejavu] First execution including JIT compilation took 0.19793057441711426s. 
[Elided: this hunk deletes a captured triton-dejavu autotuning log. The removed lines are hundreds of repetitions of two patterns: "[triton-dejavu] First execution including JIT compilation took …s." timing entries (roughly 0.16 s to 2.8 s per configuration), and "bench_cudagraph failed with out of resource: shared memory, Required: <240128–545792>, Hardware limit: 232448. Reducing block sizes or `num_stages` may help." failures, each followed by an identical OutOfResources traceback through triton_dejavu/testing.py, triton_dejavu/autotuner.py, triton/runtime/jit.py, and triton/compiler/compiler.py.]
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 253440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 253440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 302592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 302592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 302592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 302592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 354816, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 354816, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 403968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 403968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 403968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 403968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 2.1561598777770996s. -[triton-dejavu] First execution including JIT compilation took 1.0098638534545898s. -[triton-dejavu] First execution including JIT compilation took 0.4696693420410156s. -[triton-dejavu] First execution including JIT compilation took 2.832549571990967s. -[triton-dejavu] First execution including JIT compilation took 1.2741048336029053s. -[triton-dejavu] First execution including JIT compilation took 0.5360772609710693s. -[triton-dejavu] First execution including JIT compilation took 7.192165851593018s. -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. 
Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 304128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 304128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 402432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 402432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 402432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 402432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 405504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 405504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 503808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 503808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 503808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 503808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 506880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 506880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 605184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 605184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 605184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 605184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 709632, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 709632, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 807936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 807936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 807936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 807936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.5908124446868896s. -[triton-dejavu] First execution including JIT compilation took 0.41149401664733887s. -[triton-dejavu] First execution including JIT compilation took 0.2559213638305664s. -[triton-dejavu] First execution including JIT compilation took 0.7076609134674072s. -[triton-dejavu] First execution including JIT compilation took 0.5295822620391846s. -[triton-dejavu] First execution including JIT compilation took 0.31011199951171875s. -[triton-dejavu] First execution including JIT compilation took 0.7750310897827148s. -[triton-dejavu] First execution including JIT compilation took 0.4493274688720703s. -[triton-dejavu] First execution including JIT compilation took 0.30690884590148926s. -[triton-dejavu] First execution including JIT compilation took 0.7551653385162354s. -[triton-dejavu] First execution including JIT compilation took 0.46668338775634766s. -[triton-dejavu] First execution including JIT compilation took 0.30584287643432617s. -[triton-dejavu] First execution including JIT compilation took 0.7725615501403809s. -[triton-dejavu] First execution including JIT compilation took 0.482954740524292s. -[triton-dejavu] First execution including JIT compilation took 0.3182220458984375s. -[triton-dejavu] First execution including JIT compilation took 0.9150772094726562s. -[triton-dejavu] First execution including JIT compilation took 0.5212767124176025s. -[triton-dejavu] First execution including JIT compilation took 0.3300950527191162s. -[triton-dejavu] First execution including JIT compilation took 1.053274393081665s. -[triton-dejavu] First execution including JIT compilation took 0.5630724430084229s. -[triton-dejavu] First execution including JIT compilation took 0.3730814456939697s. -[triton-dejavu] First execution including JIT compilation took 0.79178786277771s. -[triton-dejavu] First execution including JIT compilation took 0.46175098419189453s. -[triton-dejavu] First execution including JIT compilation took 0.28571319580078125s. -[triton-dejavu] First execution including JIT compilation took 0.95066237449646s. 
-[triton-dejavu] First execution including JIT compilation took 0.6377534866333008s. -[triton-dejavu] First execution including JIT compilation took 0.35297322273254395s. -[triton-dejavu] First execution including JIT compilation took 1.0828590393066406s. -[triton-dejavu] First execution including JIT compilation took 0.6473112106323242s. -[triton-dejavu] First execution including JIT compilation took 0.3665587902069092s. -[triton-dejavu] First execution including JIT compilation took 1.2117927074432373s. -[triton-dejavu] First execution including JIT compilation took 0.6923372745513916s. -[triton-dejavu] First execution including JIT compilation took 0.39316701889038086s. -[triton-dejavu] First execution including JIT compilation took 1.3109822273254395s. -[triton-dejavu] First execution including JIT compilation took 0.6752035617828369s. -[triton-dejavu] First execution including JIT compilation took 0.39464235305786133s. -[triton-dejavu] First execution including JIT compilation took 1.4121100902557373s. -[triton-dejavu] First execution including JIT compilation took 0.7038490772247314s. -[triton-dejavu] First execution including JIT compilation took 0.46832728385925293s. -[triton-dejavu] First execution including JIT compilation took 1.6874134540557861s. -[triton-dejavu] First execution including JIT compilation took 0.8371837139129639s. -[triton-dejavu] First execution including JIT compilation took 0.46314477920532227s. -[triton-dejavu] First execution including JIT compilation took 1.2351336479187012s. -[triton-dejavu] First execution including JIT compilation took 0.732062816619873s. -[triton-dejavu] First execution including JIT compilation took 0.3664720058441162s. -[triton-dejavu] First execution including JIT compilation took 1.3951785564422607s. -[triton-dejavu] First execution including JIT compilation took 0.8160102367401123s. -[triton-dejavu] First execution including JIT compilation took 0.4101998805999756s. -[triton-dejavu] First execution including JIT compilation took 1.886866569519043s. -[triton-dejavu] First execution including JIT compilation took 0.9539880752563477s. -[triton-dejavu] First execution including JIT compilation took 0.4695587158203125s. -[triton-dejavu] First execution including JIT compilation took 2.1268863677978516s. -[triton-dejavu] First execution including JIT compilation took 1.127213716506958s. -[triton-dejavu] First execution including JIT compilation took 0.5213239192962646s. -[triton-dejavu] First execution including JIT compilation took 2.084219217300415s. -[triton-dejavu] First execution including JIT compilation took 1.8083431720733643s. -[triton-dejavu] First execution including JIT compilation took 0.7044923305511475s. -[triton-dejavu] First execution including JIT compilation took 2.561204195022583s. -bench_cudagraph failed with out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 292096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 292096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 333056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 333056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 333056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 333056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 2.7049872875213623s.
-[triton-dejavu] First execution including JIT compilation took 1.3836045265197754s.
-[triton-dejavu] First execution including JIT compilation took 0.6024000644683838s.
-[triton-dejavu] First execution including JIT compilation took 3.13523006439209s.
-[triton-dejavu] First execution including JIT compilation took 2.1513431072235107s.
-[triton-dejavu] First execution including JIT compilation took 0.8262593746185303s.
-[triton-dejavu] First execution including JIT compilation took 7.682474613189697s.
-[The identical OutOfResources traceback repeats verbatim for every further oversized configuration; only the failure summaries are kept below.]
-bench_cudagraph failed with out of resource: shared memory (Hardware limit: 232448) for Required sizes of 248832, 250368, 332288, 333824, 415744, 417280, 499200, 584192, and 666112 bytes. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with Triton Error [CUDA]: out of memory (RuntimeError raised from triton/backends/nvidia/driver.py, line 529, in __call__, during kernel launch).
-bench_cudagraph failed with out of resource: shared memory (Hardware limit: 232448) for Required sizes of 330752, 333824, 497664, 500736, 664576, 667648, 831488, 834560, 998400, 1168384, and 1332224 bytes. Reducing block sizes or `num_stages` may help.
-[Roughly ninety further "[triton-dejavu] First execution including JIT compilation took ...s." lines omitted; the reported times range from about 0.16s to 7.5s.]
-bench_cudagraph failed with out of resource: shared memory (Hardware limit: 232448) for Required sizes of 268800, 305664, 304128, and 307200 bytes. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 380928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 380928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 384000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 384000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 457728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 457728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 457728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 457728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 537600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 537600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 611328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 611328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 611328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 611328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.31734514236450195s. -[triton-dejavu] First execution including JIT compilation took 0.2059774398803711s. -[triton-dejavu] First execution including JIT compilation took 0.1928870677947998s. -[triton-dejavu] First execution including JIT compilation took 0.33611416816711426s. -[triton-dejavu] First execution including JIT compilation took 0.23422932624816895s. -[triton-dejavu] First execution including JIT compilation took 0.21037578582763672s. -[triton-dejavu] First execution including JIT compilation took 0.289534330368042s. -[triton-dejavu] First execution including JIT compilation took 0.24709415435791016s. -[triton-dejavu] First execution including JIT compilation took 0.2226250171661377s. -[triton-dejavu] First execution including JIT compilation took 0.3181033134460449s. -[triton-dejavu] First execution including JIT compilation took 0.2734520435333252s. -[triton-dejavu] First execution including JIT compilation took 0.2189197540283203s. -[triton-dejavu] First execution including JIT compilation took 0.3356783390045166s. -[triton-dejavu] First execution including JIT compilation took 0.2723813056945801s. -[triton-dejavu] First execution including JIT compilation took 0.2521681785583496s. -[triton-dejavu] First execution including JIT compilation took 0.32643723487854004s. -[triton-dejavu] First execution including JIT compilation took 0.278456449508667s. -[triton-dejavu] First execution including JIT compilation took 0.25096702575683594s. -[triton-dejavu] First execution including JIT compilation took 0.35304760932922363s. -[triton-dejavu] First execution including JIT compilation took 0.29747843742370605s. -[triton-dejavu] First execution including JIT compilation took 0.2503805160522461s. -[triton-dejavu] First execution including JIT compilation took 0.3057088851928711s. -[triton-dejavu] First execution including JIT compilation took 0.2160186767578125s. -[triton-dejavu] First execution including JIT compilation took 0.1883528232574463s. -[triton-dejavu] First execution including JIT compilation took 0.3313779830932617s. 
-[triton-dejavu] First execution including JIT compilation took 0.24627685546875s. -[triton-dejavu] First execution including JIT compilation took 0.201185941696167s. -[triton-dejavu] First execution including JIT compilation took 0.3443264961242676s. -[triton-dejavu] First execution including JIT compilation took 0.2596099376678467s. -[triton-dejavu] First execution including JIT compilation took 0.23357057571411133s. -[triton-dejavu] First execution including JIT compilation took 0.42798876762390137s. -[triton-dejavu] First execution including JIT compilation took 0.30511474609375s. -[triton-dejavu] First execution including JIT compilation took 0.24922823905944824s. -[triton-dejavu] First execution including JIT compilation took 0.4275035858154297s. -[triton-dejavu] First execution including JIT compilation took 0.3170912265777588s. -[triton-dejavu] First execution including JIT compilation took 0.25102734565734863s. -[triton-dejavu] First execution including JIT compilation took 0.4548606872558594s. -[triton-dejavu] First execution including JIT compilation took 0.2932870388031006s. -[triton-dejavu] First execution including JIT compilation took 0.25251173973083496s. -[triton-dejavu] First execution including JIT compilation took 0.5132782459259033s. -[triton-dejavu] First execution including JIT compilation took 0.3854689598083496s. -[triton-dejavu] First execution including JIT compilation took 0.276400089263916s. -[triton-dejavu] First execution including JIT compilation took 0.3926353454589844s. -[triton-dejavu] First execution including JIT compilation took 0.24996232986450195s. -[triton-dejavu] First execution including JIT compilation took 0.21382498741149902s. -[triton-dejavu] First execution including JIT compilation took 0.4578080177307129s. -[triton-dejavu] First execution including JIT compilation took 0.29611897468566895s. -[triton-dejavu] First execution including JIT compilation took 0.2173306941986084s. -[triton-dejavu] First execution including JIT compilation took 0.548072099685669s. -[triton-dejavu] First execution including JIT compilation took 0.33872079849243164s. -[triton-dejavu] First execution including JIT compilation took 0.23550057411193848s. -[triton-dejavu] First execution including JIT compilation took 0.5951023101806641s. -[triton-dejavu] First execution including JIT compilation took 0.349484920501709s. -[triton-dejavu] First execution including JIT compilation took 0.25032520294189453s. -[triton-dejavu] First execution including JIT compilation took 0.6920459270477295s. -[triton-dejavu] First execution including JIT compilation took 0.3887183666229248s. -[triton-dejavu] First execution including JIT compilation took 0.27884531021118164s. -[triton-dejavu] First execution including JIT compilation took 0.6650998592376709s. -[triton-dejavu] First execution including JIT compilation took 0.38024473190307617s. -[triton-dejavu] First execution including JIT compilation took 0.293820858001709s. -[triton-dejavu] First execution including JIT compilation took 0.7408139705657959s. -[triton-dejavu] First execution including JIT compilation took 0.45377397537231445s. -[triton-dejavu] First execution including JIT compilation took 0.3186800479888916s. -[triton-dejavu] First execution including JIT compilation took 0.7443890571594238s. -[triton-dejavu] First execution including JIT compilation took 0.3219418525695801s. -[triton-dejavu] First execution including JIT compilation took 0.23611903190612793s. 
-[triton-dejavu] First execution including JIT compilation took 0.7835826873779297s. -[triton-dejavu] First execution including JIT compilation took 0.3625168800354004s. -[triton-dejavu] First execution including JIT compilation took 0.2934072017669678s. -[triton-dejavu] First execution including JIT compilation took 1.3723728656768799s. -[triton-dejavu] First execution including JIT compilation took 0.5136263370513916s. -[triton-dejavu] First execution including JIT compilation took 0.3561995029449463s. -[triton-dejavu] First execution including JIT compilation took 1.3893115520477295s. -[triton-dejavu] First execution including JIT compilation took 0.4639883041381836s. -[triton-dejavu] First execution including JIT compilation took 0.32212138175964355s. -[triton-dejavu] First execution including JIT compilation took 1.5799453258514404s. -[triton-dejavu] First execution including JIT compilation took 0.519599199295044s. -[triton-dejavu] First execution including JIT compilation took 0.34169602394104004s. -[triton-dejavu] First execution including JIT compilation took 1.520521640777588s. -bench_cudagraph failed with out of resource: shared memory, Required: 253440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 253440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 253440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 253440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 338432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 338432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 338432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 338432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.3502428531646729s. -[triton-dejavu] First execution including JIT compilation took 0.5446245670318604s. -[triton-dejavu] First execution including JIT compilation took 0.3224365711212158s. -[triton-dejavu] First execution including JIT compilation took 1.8596394062042236s. -[triton-dejavu] First execution including JIT compilation took 0.6688938140869141s. -[triton-dejavu] First execution including JIT compilation took 0.37556910514831543s. 
-[triton-dejavu] First execution including JIT compilation took 5.96744441986084s. -bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 254976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 254976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 336896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 336896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 336896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 336896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 421888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 421888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 421888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 421888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 424960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 424960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 506880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 506880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 506880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 506880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 676864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 676864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 676864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 676864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.3655416965484619s. -[triton-dejavu] First execution including JIT compilation took 0.23999977111816406s. -[triton-dejavu] First execution including JIT compilation took 0.19980168342590332s. -[triton-dejavu] First execution including JIT compilation took 0.3963167667388916s. -[triton-dejavu] First execution including JIT compilation took 0.29250240325927734s. -[triton-dejavu] First execution including JIT compilation took 0.23019075393676758s. 
-[triton-dejavu] First execution including JIT compilation took 0.3897242546081543s. -[triton-dejavu] First execution including JIT compilation took 0.26331114768981934s. -[triton-dejavu] First execution including JIT compilation took 0.25720906257629395s. -[triton-dejavu] First execution including JIT compilation took 0.5272367000579834s. -[triton-dejavu] First execution including JIT compilation took 0.3620729446411133s. -[triton-dejavu] First execution including JIT compilation took 0.30249667167663574s. -[triton-dejavu] First execution including JIT compilation took 0.5528614521026611s. -[triton-dejavu] First execution including JIT compilation took 0.38202786445617676s. -[triton-dejavu] First execution including JIT compilation took 0.32003331184387207s. -[triton-dejavu] First execution including JIT compilation took 0.5820906162261963s. -[triton-dejavu] First execution including JIT compilation took 0.3516204357147217s. -[triton-dejavu] First execution including JIT compilation took 0.2539525032043457s. -[triton-dejavu] First execution including JIT compilation took 0.5132083892822266s. -[triton-dejavu] First execution including JIT compilation took 0.3485991954803467s. -[triton-dejavu] First execution including JIT compilation took 0.26761674880981445s. -[triton-dejavu] First execution including JIT compilation took 0.3998754024505615s. -[triton-dejavu] First execution including JIT compilation took 0.270932674407959s. -[triton-dejavu] First execution including JIT compilation took 0.21268010139465332s. -[triton-dejavu] First execution including JIT compilation took 0.4931457042694092s. -[triton-dejavu] First execution including JIT compilation took 0.3084697723388672s. -[triton-dejavu] First execution including JIT compilation took 0.22578716278076172s. -[triton-dejavu] First execution including JIT compilation took 0.4800398349761963s. -[triton-dejavu] First execution including JIT compilation took 0.3248765468597412s. -[triton-dejavu] First execution including JIT compilation took 0.25438714027404785s. -[triton-dejavu] First execution including JIT compilation took 0.5268030166625977s. -[triton-dejavu] First execution including JIT compilation took 0.32793354988098145s. -[triton-dejavu] First execution including JIT compilation took 0.2654423713684082s. -[triton-dejavu] First execution including JIT compilation took 0.5680561065673828s. -[triton-dejavu] First execution including JIT compilation took 0.3322784900665283s. -[triton-dejavu] First execution including JIT compilation took 0.25258684158325195s. -[triton-dejavu] First execution including JIT compilation took 0.5792534351348877s. -[triton-dejavu] First execution including JIT compilation took 0.5247256755828857s. -[triton-dejavu] First execution including JIT compilation took 0.3439359664916992s. -[triton-dejavu] First execution including JIT compilation took 0.8489353656768799s. -[triton-dejavu] First execution including JIT compilation took 0.5044565200805664s. -[triton-dejavu] First execution including JIT compilation took 0.39157629013061523s. -[triton-dejavu] First execution including JIT compilation took 0.733513355255127s. -[triton-dejavu] First execution including JIT compilation took 0.38277316093444824s. -[triton-dejavu] First execution including JIT compilation took 0.2873697280883789s. -[triton-dejavu] First execution including JIT compilation took 0.8169002532958984s. -[triton-dejavu] First execution including JIT compilation took 0.3655128479003906s. 
-[triton-dejavu] First execution including JIT compilation took 0.26145172119140625s. -[triton-dejavu] First execution including JIT compilation took 0.8048985004425049s. -[triton-dejavu] First execution including JIT compilation took 0.40337085723876953s. -[triton-dejavu] First execution including JIT compilation took 0.2873227596282959s. -[triton-dejavu] First execution including JIT compilation took 0.7874279022216797s. -[triton-dejavu] First execution including JIT compilation took 0.4543600082397461s. -[triton-dejavu] First execution including JIT compilation took 0.30629849433898926s. -[triton-dejavu] First execution including JIT compilation took 1.1004579067230225s. -[triton-dejavu] First execution including JIT compilation took 0.595219612121582s. -[triton-dejavu] First execution including JIT compilation took 0.4115121364593506s. -[triton-dejavu] First execution including JIT compilation took 1.1447741985321045s. -[triton-dejavu] First execution including JIT compilation took 0.6449964046478271s. -[triton-dejavu] First execution including JIT compilation took 0.42902207374572754s. -[triton-dejavu] First execution including JIT compilation took 1.485217809677124s. -[triton-dejavu] First execution including JIT compilation took 0.7232568264007568s. -[triton-dejavu] First execution including JIT compilation took 0.478473424911499s. -[triton-dejavu] First execution including JIT compilation took 1.249863862991333s. -[triton-dejavu] First execution including JIT compilation took 0.5245099067687988s. -[triton-dejavu] First execution including JIT compilation took 0.3504352569580078s. -[triton-dejavu] First execution including JIT compilation took 1.5189287662506104s. -[triton-dejavu] First execution including JIT compilation took 0.5968093872070312s. -[triton-dejavu] First execution including JIT compilation took 0.4099123477935791s. -[triton-dejavu] First execution including JIT compilation took 2.0371575355529785s. -[triton-dejavu] First execution including JIT compilation took 0.5784683227539062s. -[triton-dejavu] First execution including JIT compilation took 0.3639485836029053s. -[triton-dejavu] First execution including JIT compilation took 2.0350635051727295s. -[triton-dejavu] First execution including JIT compilation took 0.7109920978546143s. -[triton-dejavu] First execution including JIT compilation took 0.4702615737915039s. -[triton-dejavu] First execution including JIT compilation took 1.9885196685791016s. -bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-
-bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 253440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 302592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 302592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 354816, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 403968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 403968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 2.343762159347534s.
-[triton-dejavu] First execution including JIT compilation took 0.7591912746429443s.
-[triton-dejavu] First execution including JIT compilation took 0.41748905181884766s.
-[triton-dejavu] First execution including JIT compilation took 2.6733522415161133s.
-[triton-dejavu] First execution including JIT compilation took 0.8401713371276855s.
-[triton-dejavu] First execution including JIT compilation took 0.44391798973083496s.
-[triton-dejavu] First execution including JIT compilation took 8.486325740814209s.
-bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 304128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 402432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 402432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 405504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 503808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 503808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 506880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 605184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 605184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 709632, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 807936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 807936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 0.6625301837921143s.
-[triton-dejavu] First execution including JIT compilation took 0.4150726795196533s.
-[triton-dejavu] First execution including JIT compilation took 0.2472681999206543s.
-[triton-dejavu] First execution including JIT compilation took 0.6982488632202148s.
-[triton-dejavu] First execution including JIT compilation took 0.38659119606018066s.
-[triton-dejavu] First execution including JIT compilation took 0.2670154571533203s.
-[triton-dejavu] First execution including JIT compilation took 0.6746160984039307s. -[triton-dejavu] First execution including JIT compilation took 0.38513994216918945s. -[triton-dejavu] First execution including JIT compilation took 0.2773430347442627s. -[triton-dejavu] First execution including JIT compilation took 0.7035007476806641s. -[triton-dejavu] First execution including JIT compilation took 0.43756604194641113s. -[triton-dejavu] First execution including JIT compilation took 0.27396464347839355s. -[triton-dejavu] First execution including JIT compilation took 0.8079738616943359s. -[triton-dejavu] First execution including JIT compilation took 0.4706554412841797s. -[triton-dejavu] First execution including JIT compilation took 0.3690028190612793s. -[triton-dejavu] First execution including JIT compilation took 1.0047783851623535s. -[triton-dejavu] First execution including JIT compilation took 0.4361457824707031s. -[triton-dejavu] First execution including JIT compilation took 0.364422082901001s. -[triton-dejavu] First execution including JIT compilation took 0.8165838718414307s. -[triton-dejavu] First execution including JIT compilation took 0.48160290718078613s. -[triton-dejavu] First execution including JIT compilation took 0.3796987533569336s. -[triton-dejavu] First execution including JIT compilation took 0.8447437286376953s. -[triton-dejavu] First execution including JIT compilation took 0.3994133472442627s. -[triton-dejavu] First execution including JIT compilation took 0.3070847988128662s. -[triton-dejavu] First execution including JIT compilation took 0.8190915584564209s. -[triton-dejavu] First execution including JIT compilation took 0.44321155548095703s. -[triton-dejavu] First execution including JIT compilation took 0.30388951301574707s. -[triton-dejavu] First execution including JIT compilation took 0.8760182857513428s. -[triton-dejavu] First execution including JIT compilation took 0.498699426651001s. -[triton-dejavu] First execution including JIT compilation took 0.33666563034057617s. -[triton-dejavu] First execution including JIT compilation took 0.9385683536529541s. -[triton-dejavu] First execution including JIT compilation took 0.49439096450805664s. -[triton-dejavu] First execution including JIT compilation took 0.34059906005859375s. -[triton-dejavu] First execution including JIT compilation took 0.9848973751068115s. -[triton-dejavu] First execution including JIT compilation took 0.5026867389678955s. -[triton-dejavu] First execution including JIT compilation took 0.3362538814544678s. -[triton-dejavu] First execution including JIT compilation took 1.054696798324585s. -[triton-dejavu] First execution including JIT compilation took 0.5362629890441895s. -[triton-dejavu] First execution including JIT compilation took 0.3406381607055664s. -[triton-dejavu] First execution including JIT compilation took 1.1980383396148682s. -[triton-dejavu] First execution including JIT compilation took 0.6002733707427979s. -[triton-dejavu] First execution including JIT compilation took 0.4032762050628662s. -[triton-dejavu] First execution including JIT compilation took 1.0670430660247803s. -[triton-dejavu] First execution including JIT compilation took 0.5054290294647217s. -[triton-dejavu] First execution including JIT compilation took 0.31844234466552734s. -[triton-dejavu] First execution including JIT compilation took 1.20845365524292s. -[triton-dejavu] First execution including JIT compilation took 0.635533332824707s. 
-[triton-dejavu] First execution including JIT compilation took 0.39752840995788574s.
-[triton-dejavu] First execution including JIT compilation took 1.2634165287017822s.
-[triton-dejavu] First execution including JIT compilation took 0.6931250095367432s.
-[triton-dejavu] First execution including JIT compilation took 0.3806438446044922s.
-[triton-dejavu] First execution including JIT compilation took 1.3524491786956787s.
-[triton-dejavu] First execution including JIT compilation took 0.6660432815551758s.
-[triton-dejavu] First execution including JIT compilation took 0.4266016483306885s.
-[triton-dejavu] First execution including JIT compilation took 1.3512389659881592s.
-[triton-dejavu] First execution including JIT compilation took 0.7300617694854736s.
-[triton-dejavu] First execution including JIT compilation took 0.4240868091583252s.
-[triton-dejavu] First execution including JIT compilation took 1.5339932441711426s.
-[triton-dejavu] First execution including JIT compilation took 0.7730631828308105s.
-[triton-dejavu] First execution including JIT compilation took 0.506572961807251s.
-bench_cudagraph failed with out of resource: shared memory, Required: 234752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 267520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 267520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 1.9073479175567627s.
-[triton-dejavu] First execution including JIT compilation took 0.7516419887542725s.
-[triton-dejavu] First execution including JIT compilation took 0.49443531036376953s.
-[triton-dejavu] First execution including JIT compilation took 2.24280047416687s.
-[triton-dejavu] First execution including JIT compilation took 0.8264782428741455s.
-[triton-dejavu] First execution including JIT compilation took 0.49369287490844727s.
-[triton-dejavu] First execution including JIT compilation took 3.268693208694458s.
-[triton-dejavu] First execution including JIT compilation took 0.9770853519439697s.
-[triton-dejavu] First execution including JIT compilation took 0.5697095394134521s.
-[triton-dejavu] First execution including JIT compilation took 3.2506027221679688s.
-bench_cudagraph failed with out of resource: shared memory, Required: 266752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 266752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 333824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 335360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 400896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 400896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 400896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 400896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 469504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 469504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 535040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 535040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 535040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 535040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 4.055661916732788s. -[triton-dejavu] First execution including JIT compilation took 1.567265272140503s. -[triton-dejavu] First execution including JIT compilation took 0.6991770267486572s. -[triton-dejavu] First execution including JIT compilation took 4.434000730514526s. -bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 402432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 402432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 533504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 533504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 533504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 533504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 536576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 536576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 667648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 667648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 667648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 667648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 670720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 670720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 801792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 801792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 801792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 801792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 939008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 939008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1070080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1070080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1070080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1070080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.2387633323669434s. -[triton-dejavu] First execution including JIT compilation took 0.6717429161071777s. -[triton-dejavu] First execution including JIT compilation took 0.4157521724700928s. -[triton-dejavu] First execution including JIT compilation took 1.316270351409912s. -[triton-dejavu] First execution including JIT compilation took 0.7596695423126221s. -[triton-dejavu] First execution including JIT compilation took 0.4030904769897461s. 
-[triton-dejavu] First execution including JIT compilation took 1.4970901012420654s. -[triton-dejavu] First execution including JIT compilation took 0.9932441711425781s. -[triton-dejavu] First execution including JIT compilation took 0.4329719543457031s. -[triton-dejavu] First execution including JIT compilation took 1.4383361339569092s. -[triton-dejavu] First execution including JIT compilation took 0.7822005748748779s. -[triton-dejavu] First execution including JIT compilation took 0.5704188346862793s. -[triton-dejavu] First execution including JIT compilation took 1.9332118034362793s. -[triton-dejavu] First execution including JIT compilation took 1.031646728515625s. -[triton-dejavu] First execution including JIT compilation took 0.5641162395477295s. -[triton-dejavu] First execution including JIT compilation took 1.697702407836914s. -[triton-dejavu] First execution including JIT compilation took 0.82403564453125s. -[triton-dejavu] First execution including JIT compilation took 0.49923229217529297s. -[triton-dejavu] First execution including JIT compilation took 1.6510090827941895s. -[triton-dejavu] First execution including JIT compilation took 0.8686649799346924s. -[triton-dejavu] First execution including JIT compilation took 0.49845027923583984s. -[triton-dejavu] First execution including JIT compilation took 1.5030395984649658s. -[triton-dejavu] First execution including JIT compilation took 1.132683277130127s. -[triton-dejavu] First execution including JIT compilation took 0.5156295299530029s. -[triton-dejavu] First execution including JIT compilation took 1.8180909156799316s. -[triton-dejavu] First execution including JIT compilation took 0.8981871604919434s. -[triton-dejavu] First execution including JIT compilation took 0.46332740783691406s. -[triton-dejavu] First execution including JIT compilation took 2.3511245250701904s. -[triton-dejavu] First execution including JIT compilation took 1.3827064037322998s. -[triton-dejavu] First execution including JIT compilation took 0.6484944820404053s. -[triton-dejavu] First execution including JIT compilation took 2.287813663482666s. -[triton-dejavu] First execution including JIT compilation took 1.1250503063201904s. -[triton-dejavu] First execution including JIT compilation took 0.5330066680908203s. -[triton-dejavu] First execution including JIT compilation took 2.089167594909668s. -[triton-dejavu] First execution including JIT compilation took 1.1511783599853516s. -[triton-dejavu] First execution including JIT compilation took 0.5573456287384033s. -[triton-dejavu] First execution including JIT compilation took 2.3208694458007812s. -[triton-dejavu] First execution including JIT compilation took 1.2315797805786133s. -[triton-dejavu] First execution including JIT compilation took 0.6272659301757812s. -[triton-dejavu] First execution including JIT compilation took 2.580502510070801s. -[triton-dejavu] First execution including JIT compilation took 1.3232295513153076s. -[triton-dejavu] First execution including JIT compilation took 0.6254336833953857s. -[triton-dejavu] First execution including JIT compilation took 2.4162003993988037s. -[triton-dejavu] First execution including JIT compilation took 1.2951240539550781s. -[triton-dejavu] First execution including JIT compilation took 0.6093685626983643s. -[triton-dejavu] First execution including JIT compilation took 2.4826173782348633s. -[triton-dejavu] First execution including JIT compilation took 1.444998025894165s. 
-[triton-dejavu] First execution including JIT compilation took 0.6317059993743896s. -[triton-dejavu] First execution including JIT compilation took 3.2404682636260986s. -[triton-dejavu] First execution including JIT compilation took 1.592949628829956s. -[triton-dejavu] First execution including JIT compilation took 0.7632882595062256s. -[triton-dejavu] First execution including JIT compilation took 3.501128911972046s. -[triton-dejavu] First execution including JIT compilation took 1.8167932033538818s. -[triton-dejavu] First execution including JIT compilation took 0.7571108341217041s. -[triton-dejavu] First execution including JIT compilation took 3.3915903568267822s. -bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 298752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 298752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 298752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 298752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 349440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 349440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 398592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 398592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 398592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
- fn()
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
- return jit_first_time()
- ^^^^^^^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
- ret = self.call_lambda()
- ^^^^^^^^^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
- self.fn.run(
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
- kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
- ^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
- self._init_handles()
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
- raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 398592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 4.791898488998413s.
-[triton-dejavu] First execution including JIT compilation took 2.434124231338501s.
-[triton-dejavu] First execution including JIT compilation took 1.172961711883545s.
-[triton-dejavu] First execution including JIT compilation took 5.012480020523071s.
-[triton-dejavu] First execution including JIT compilation took 2.740521192550659s.
-[triton-dejavu] First execution including JIT compilation took 1.2781014442443848s.
-[triton-dejavu] First execution including JIT compilation took 9.610878944396973s.
-bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 299520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 397824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 397824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 499200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 597504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 597504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 698880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 797184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 797184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
- fn()
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
- return jit_first_time()
- ^^^^^^^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
- ret = self.call_lambda()
- ^^^^^^^^^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
- self.fn.run(
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
- kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__
- self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
-RuntimeError: Triton Error [CUDA]: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 7.976198434829712s.
-[triton-dejavu] First execution including JIT compilation took 3.3016257286071777s.
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 599040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 798720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 995328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 995328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 998400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1195008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1195008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1397760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1594368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1594368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 0.31671142578125s.
-[triton-dejavu] First execution including JIT compilation took 0.22868680953979492s.
-[triton-dejavu] First execution including JIT compilation took 0.18102025985717773s.
-[triton-dejavu] First execution including JIT compilation took 0.30118656158447266s.
-[triton-dejavu] First execution including JIT compilation took 0.23821210861206055s.
-[triton-dejavu] First execution including JIT compilation took 0.20787501335144043s.
-[triton-dejavu] First execution including JIT compilation took 0.32696962356567383s. -[triton-dejavu] First execution including JIT compilation took 0.2887406349182129s. -[triton-dejavu] First execution including JIT compilation took 0.21834921836853027s. -[triton-dejavu] First execution including JIT compilation took 0.3684656620025635s. -[triton-dejavu] First execution including JIT compilation took 0.2897188663482666s. -[triton-dejavu] First execution including JIT compilation took 0.21413230895996094s. -[triton-dejavu] First execution including JIT compilation took 0.4073657989501953s. -[triton-dejavu] First execution including JIT compilation took 0.29150938987731934s. -[triton-dejavu] First execution including JIT compilation took 0.24649262428283691s. -[triton-dejavu] First execution including JIT compilation took 0.3873109817504883s. -[triton-dejavu] First execution including JIT compilation took 0.3143148422241211s. -[triton-dejavu] First execution including JIT compilation took 0.25432682037353516s. -[triton-dejavu] First execution including JIT compilation took 0.4593379497528076s. -[triton-dejavu] First execution including JIT compilation took 0.3326547145843506s. -[triton-dejavu] First execution including JIT compilation took 0.27237915992736816s. -[triton-dejavu] First execution including JIT compilation took 0.37383532524108887s. -[triton-dejavu] First execution including JIT compilation took 0.2576146125793457s. -[triton-dejavu] First execution including JIT compilation took 0.2171943187713623s. -[triton-dejavu] First execution including JIT compilation took 0.3749668598175049s. -[triton-dejavu] First execution including JIT compilation took 0.3074686527252197s. -[triton-dejavu] First execution including JIT compilation took 0.2472078800201416s. -[triton-dejavu] First execution including JIT compilation took 0.4607219696044922s. -[triton-dejavu] First execution including JIT compilation took 0.2899644374847412s. -[triton-dejavu] First execution including JIT compilation took 0.29875612258911133s. -[triton-dejavu] First execution including JIT compilation took 0.59027099609375s. -[triton-dejavu] First execution including JIT compilation took 0.333834171295166s. -[triton-dejavu] First execution including JIT compilation took 0.26287293434143066s. -[triton-dejavu] First execution including JIT compilation took 0.6803445816040039s. -[triton-dejavu] First execution including JIT compilation took 0.3831348419189453s. -[triton-dejavu] First execution including JIT compilation took 0.30806612968444824s. -[triton-dejavu] First execution including JIT compilation took 0.5800254344940186s. -[triton-dejavu] First execution including JIT compilation took 0.42940402030944824s. -[triton-dejavu] First execution including JIT compilation took 0.3212895393371582s. -[triton-dejavu] First execution including JIT compilation took 0.6800441741943359s. -[triton-dejavu] First execution including JIT compilation took 0.4293978214263916s. -[triton-dejavu] First execution including JIT compilation took 0.31106042861938477s. -[triton-dejavu] First execution including JIT compilation took 0.5731973648071289s. -[triton-dejavu] First execution including JIT compilation took 0.3158423900604248s. -[triton-dejavu] First execution including JIT compilation took 0.21982431411743164s. -[triton-dejavu] First execution including JIT compilation took 0.6607396602630615s. -[triton-dejavu] First execution including JIT compilation took 0.33777403831481934s. 
-[triton-dejavu] First execution including JIT compilation took 0.26319289207458496s. -[triton-dejavu] First execution including JIT compilation took 1.0609164237976074s. -[triton-dejavu] First execution including JIT compilation took 0.42955660820007324s. -[triton-dejavu] First execution including JIT compilation took 0.29787397384643555s. -[triton-dejavu] First execution including JIT compilation took 1.1868953704833984s. -[triton-dejavu] First execution including JIT compilation took 0.44803476333618164s. -[triton-dejavu] First execution including JIT compilation took 0.31003737449645996s. -[triton-dejavu] First execution including JIT compilation took 1.2543339729309082s. -[triton-dejavu] First execution including JIT compilation took 0.5254006385803223s. -[triton-dejavu] First execution including JIT compilation took 0.32144618034362793s. -[triton-dejavu] First execution including JIT compilation took 1.3773908615112305s. -[triton-dejavu] First execution including JIT compilation took 0.5259160995483398s. -[triton-dejavu] First execution including JIT compilation took 0.34156179428100586s. -bench_cudagraph failed with out of resource: shared memory, Required: 249088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 283904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 283904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 283904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 283904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.1119396686553955s. -[triton-dejavu] First execution including JIT compilation took 0.4642174243927002s. -[triton-dejavu] First execution including JIT compilation took 0.2669200897216797s. -[triton-dejavu] First execution including JIT compilation took 1.3371021747589111s. -[triton-dejavu] First execution including JIT compilation took 0.49892330169677734s. -[triton-dejavu] First execution including JIT compilation took 0.318439245223999s. 
-[triton-dejavu] First execution including JIT compilation took 4.929638385772705s.
-[triton-dejavu] First execution including JIT compilation took 0.9740848541259766s.
-[triton-dejavu] First execution including JIT compilation took 0.37487220764160156s.
-[triton-dejavu] First execution including JIT compilation took 5.154336214065552s.
-bench_cudagraph failed with out of resource: shared memory, Required: 283136, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 283136, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 355840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 425472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 425472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 498176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 567808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 567808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-[triton-dejavu] First execution including JIT compilation took 5.1173014640808105s.
-[triton-dejavu] First execution including JIT compilation took 1.0967254638671875s.
-[triton-dejavu] First execution including JIT compilation took 0.48938989639282227s.
-[triton-dejavu] First execution including JIT compilation took 3.199140787124634s.
-bench_cudagraph failed with out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 284672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 423936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 423936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 427008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 566272, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 566272, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 569344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 708608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 708608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 711680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 850944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 850944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 996352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1135616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1135616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-[triton-dejavu] First execution including JIT compilation took 0.5043489933013916s.
-[triton-dejavu] First execution including JIT compilation took 0.3277888298034668s.
-[triton-dejavu] First execution including JIT compilation took 0.24386119842529297s.
-[triton-dejavu] First execution including JIT compilation took 0.5100774765014648s.
-[triton-dejavu] First execution including JIT compilation took 0.34099698066711426s.
-[triton-dejavu] First execution including JIT compilation took 0.2779710292816162s.
-[triton-dejavu] First execution including JIT compilation took 0.5361578464508057s. -[triton-dejavu] First execution including JIT compilation took 0.3682215213775635s. -[triton-dejavu] First execution including JIT compilation took 0.300004243850708s. -[triton-dejavu] First execution including JIT compilation took 0.5743675231933594s. -[triton-dejavu] First execution including JIT compilation took 0.3822929859161377s. -[triton-dejavu] First execution including JIT compilation took 0.3132593631744385s. -[triton-dejavu] First execution including JIT compilation took 0.600147008895874s. -[triton-dejavu] First execution including JIT compilation took 0.406186580657959s. -[triton-dejavu] First execution including JIT compilation took 0.328277587890625s. -[triton-dejavu] First execution including JIT compilation took 0.6361839771270752s. -[triton-dejavu] First execution including JIT compilation took 0.4613196849822998s. -[triton-dejavu] First execution including JIT compilation took 0.34471893310546875s. -[triton-dejavu] First execution including JIT compilation took 0.7103567123413086s. -[triton-dejavu] First execution including JIT compilation took 0.46198368072509766s. -[triton-dejavu] First execution including JIT compilation took 0.37233877182006836s. -[triton-dejavu] First execution including JIT compilation took 0.5790450572967529s. -[triton-dejavu] First execution including JIT compilation took 0.33751773834228516s. -[triton-dejavu] First execution including JIT compilation took 0.26027369499206543s. -[triton-dejavu] First execution including JIT compilation took 0.6138906478881836s. -[triton-dejavu] First execution including JIT compilation took 0.4593079090118408s. -[triton-dejavu] First execution including JIT compilation took 0.29265832901000977s. -[triton-dejavu] First execution including JIT compilation took 0.6543664932250977s. -[triton-dejavu] First execution including JIT compilation took 0.40736937522888184s. -[triton-dejavu] First execution including JIT compilation took 0.2809460163116455s. -[triton-dejavu] First execution including JIT compilation took 0.7575297355651855s. -[triton-dejavu] First execution including JIT compilation took 0.33360981941223145s. -[triton-dejavu] First execution including JIT compilation took 0.31966447830200195s. -[triton-dejavu] First execution including JIT compilation took 0.8111255168914795s. -[triton-dejavu] First execution including JIT compilation took 0.46427249908447266s. -[triton-dejavu] First execution including JIT compilation took 0.34816551208496094s. -[triton-dejavu] First execution including JIT compilation took 0.9121909141540527s. -[triton-dejavu] First execution including JIT compilation took 0.49946069717407227s. -[triton-dejavu] First execution including JIT compilation took 0.3831624984741211s. -[triton-dejavu] First execution including JIT compilation took 1.057861328125s. -[triton-dejavu] First execution including JIT compilation took 0.5552551746368408s. -[triton-dejavu] First execution including JIT compilation took 0.3699655532836914s. -[triton-dejavu] First execution including JIT compilation took 0.8594727516174316s. -[triton-dejavu] First execution including JIT compilation took 0.42781662940979004s. -[triton-dejavu] First execution including JIT compilation took 0.31317567825317383s. -[triton-dejavu] First execution including JIT compilation took 0.9985849857330322s. -[triton-dejavu] First execution including JIT compilation took 0.5023458003997803s. 
-[triton-dejavu] First execution including JIT compilation took 0.37673401832580566s.
-[triton-dejavu] First execution including JIT compilation took 1.501765251159668s.
-[triton-dejavu] First execution including JIT compilation took 0.5676426887512207s.
-[triton-dejavu] First execution including JIT compilation took 0.3674488067626953s.
-[triton-dejavu] First execution including JIT compilation took 1.6308376789093018s.
-[triton-dejavu] First execution including JIT compilation took 0.6221895217895508s.
-[triton-dejavu] First execution including JIT compilation took 0.42139220237731934s.
-[triton-dejavu] First execution including JIT compilation took 1.6980812549591064s.
-[triton-dejavu] First execution including JIT compilation took 0.6677892208099365s.
-[triton-dejavu] First execution including JIT compilation took 0.4159400463104248s.
-[triton-dejavu] First execution including JIT compilation took 1.8085997104644775s.
-[triton-dejavu] First execution including JIT compilation took 0.7202484607696533s.
-[triton-dejavu] First execution including JIT compilation took 0.4506070613861084s.
-bench_cudagraph failed with out of resource: shared memory, Required: 263424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 300288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 300288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-[triton-dejavu] First execution including JIT compilation took 1.7220134735107422s.
-[triton-dejavu] First execution including JIT compilation took 0.6867120265960693s.
-[triton-dejavu] First execution including JIT compilation took 0.4079298973083496s.
-[triton-dejavu] First execution including JIT compilation took 1.9503588676452637s.
-[triton-dejavu] First execution including JIT compilation took 0.7429883480072021s.
-[triton-dejavu] First execution including JIT compilation took 0.46311044692993164s.
-[triton-dejavu] First execution including JIT compilation took 5.910313367843628s.
-[triton-dejavu] First execution including JIT compilation took 1.2488391399383545s.
-[triton-dejavu] First execution including JIT compilation took 0.5487070083618164s.
-[triton-dejavu] First execution including JIT compilation took 5.962830066680908s.
-bench_cudagraph failed with out of resource: shared memory, Required: 299520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 299520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 374784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 374784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 376320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 450048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 450048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 526848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 526848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 600576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 600576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 600576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 600576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 5.886913776397705s. -[triton-dejavu] First execution including JIT compilation took 1.1668055057525635s. -[triton-dejavu] First execution including JIT compilation took 0.5872712135314941s. -[triton-dejavu] First execution including JIT compilation took 4.414681434631348s. -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 451584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 451584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 599040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 599040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 599040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 599040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 602112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 602112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 749568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 749568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 749568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 749568, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 752640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 752640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 900096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 900096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 900096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 900096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1201152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1201152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1201152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1201152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.7140195369720459s. -[triton-dejavu] First execution including JIT compilation took 0.36057424545288086s. -[triton-dejavu] First execution including JIT compilation took 0.25351953506469727s. -[triton-dejavu] First execution including JIT compilation took 0.6811349391937256s. -[triton-dejavu] First execution including JIT compilation took 0.39322566986083984s. -[triton-dejavu] First execution including JIT compilation took 0.26434326171875s. 
-[triton-dejavu] First execution including JIT compilation took 0.7010984420776367s. -[triton-dejavu] First execution including JIT compilation took 0.3896751403808594s. -[triton-dejavu] First execution including JIT compilation took 0.28528761863708496s. -[triton-dejavu] First execution including JIT compilation took 0.7317478656768799s. -[triton-dejavu] First execution including JIT compilation took 0.4223208427429199s. -[triton-dejavu] First execution including JIT compilation took 0.28917455673217773s. -[triton-dejavu] First execution including JIT compilation took 0.744981050491333s. -[triton-dejavu] First execution including JIT compilation took 0.3946702480316162s. -[triton-dejavu] First execution including JIT compilation took 0.29891490936279297s. -[triton-dejavu] First execution including JIT compilation took 0.765406608581543s. -[triton-dejavu] First execution including JIT compilation took 0.44653844833374023s. -[triton-dejavu] First execution including JIT compilation took 0.33998966217041016s. -[triton-dejavu] First execution including JIT compilation took 0.8716061115264893s. -[triton-dejavu] First execution including JIT compilation took 0.4519209861755371s. -[triton-dejavu] First execution including JIT compilation took 0.348386287689209s. -[triton-dejavu] First execution including JIT compilation took 1.0358567237854004s. -[triton-dejavu] First execution including JIT compilation took 0.4894859790802002s. -[triton-dejavu] First execution including JIT compilation took 0.3279075622558594s. -[triton-dejavu] First execution including JIT compilation took 1.148808479309082s. -[triton-dejavu] First execution including JIT compilation took 0.5393466949462891s. -[triton-dejavu] First execution including JIT compilation took 0.3747735023498535s. -[triton-dejavu] First execution including JIT compilation took 1.237614631652832s. -[triton-dejavu] First execution including JIT compilation took 0.5807638168334961s. -[triton-dejavu] First execution including JIT compilation took 0.3793628215789795s. -[triton-dejavu] First execution including JIT compilation took 1.323664903640747s. -[triton-dejavu] First execution including JIT compilation took 0.6247925758361816s. -[triton-dejavu] First execution including JIT compilation took 0.39437222480773926s. -[triton-dejavu] First execution including JIT compilation took 1.3928866386413574s. -[triton-dejavu] First execution including JIT compilation took 0.6385958194732666s. -[triton-dejavu] First execution including JIT compilation took 0.4033050537109375s. -[triton-dejavu] First execution including JIT compilation took 1.440335988998413s. -[triton-dejavu] First execution including JIT compilation took 0.5338249206542969s. -[triton-dejavu] First execution including JIT compilation took 0.33176136016845703s. -[triton-dejavu] First execution including JIT compilation took 1.2540497779846191s. -[triton-dejavu] First execution including JIT compilation took 0.5932145118713379s. -[triton-dejavu] First execution including JIT compilation took 0.35477566719055176s. -[triton-dejavu] First execution including JIT compilation took 1.5448269844055176s. -[triton-dejavu] First execution including JIT compilation took 0.6225264072418213s. -[triton-dejavu] First execution including JIT compilation took 0.36580657958984375s. -[triton-dejavu] First execution including JIT compilation took 1.6768112182617188s. -[triton-dejavu] First execution including JIT compilation took 0.6222131252288818s. 
-[triton-dejavu] First execution including JIT compilation took 0.42403435707092285s. -[triton-dejavu] First execution including JIT compilation took 1.895324945449829s. -[triton-dejavu] First execution including JIT compilation took 0.6491637229919434s. -[triton-dejavu] First execution including JIT compilation took 0.49414658546447754s. -[triton-dejavu] First execution including JIT compilation took 2.1658642292022705s. -[triton-dejavu] First execution including JIT compilation took 0.9408359527587891s. -[triton-dejavu] First execution including JIT compilation took 0.4878816604614258s. -[triton-dejavu] First execution including JIT compilation took 2.3840792179107666s. -[triton-dejavu] First execution including JIT compilation took 0.8270976543426514s. -[triton-dejavu] First execution including JIT compilation took 0.41714978218078613s. -[triton-dejavu] First execution including JIT compilation took 2.040860891342163s. -bench_cudagraph failed with out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 292096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 292096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 333056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 333056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 333056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 333056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 2.6213810443878174s. -[triton-dejavu] First execution including JIT compilation took 0.9269452095031738s. -[triton-dejavu] First execution including JIT compilation took 0.47311830520629883s. -[triton-dejavu] First execution including JIT compilation took 2.7501637935638428s. -[triton-dejavu] First execution including JIT compilation took 0.7629375457763672s. -[triton-dejavu] First execution including JIT compilation took 0.4511408805847168s. 
-[triton-dejavu] First execution including JIT compilation took 7.134660243988037s.
-bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
[... dozens of further removed log lines omitted: the identical "bench_cudagraph failed" OutOfResources traceback repeats for tuning configurations requiring between 248832 and 1332224 bytes of shared memory against the 232448-byte hardware limit, interleaved with "[triton-dejavu] First execution including JIT compilation took ...s." timing lines ranging from roughly 0.38 s to 10.9 s ...]
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 397824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 499200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 499200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 597504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 597504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 597504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 597504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 698880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 698880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 797184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 797184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 797184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 797184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ - self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) -RuntimeError: Triton Error [CUDA]: out of memory - -[triton-dejavu] First execution including JIT compilation took 3.2677865028381348s. -[triton-dejavu] First execution including JIT compilation took 1.392303705215454s. -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ - self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 599040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 599040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 795648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 798720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 798720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 995328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 995328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 995328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 995328, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 998400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 998400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1195008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1195008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1195008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1195008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1397760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1397760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1594368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1594368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1594368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1594368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 3.4989428520202637s. -[triton-dejavu] First execution including JIT compilation took 1.8932569026947021s. -[triton-dejavu] First execution including JIT compilation took 0.732560396194458s. -[triton-dejavu] First execution including JIT compilation took 3.6859214305877686s. -[triton-dejavu] First execution including JIT compilation took 1.8280115127563477s. -[triton-dejavu] First execution including JIT compilation took 0.7882623672485352s. -[triton-dejavu] First execution including JIT compilation took 3.735793113708496s. -[triton-dejavu] First execution including JIT compilation took 2.119565486907959s. -[triton-dejavu] First execution including JIT compilation took 0.8172221183776855s. -[triton-dejavu] First execution including JIT compilation took 4.001902341842651s. -[triton-dejavu] First execution including JIT compilation took 1.949631690979004s. -[triton-dejavu] First execution including JIT compilation took 0.8840606212615967s. -[triton-dejavu] First execution including JIT compilation took 4.194988489151001s. -[triton-dejavu] First execution including JIT compilation took 2.0192768573760986s. -[triton-dejavu] First execution including JIT compilation took 0.8446464538574219s. -[triton-dejavu] First execution including JIT compilation took 3.981654644012451s. -[triton-dejavu] First execution including JIT compilation took 2.0983834266662598s. -[triton-dejavu] First execution including JIT compilation took 0.9342923164367676s. -[triton-dejavu] First execution including JIT compilation took 3.9925155639648438s. -[triton-dejavu] First execution including JIT compilation took 2.2087132930755615s. -[triton-dejavu] First execution including JIT compilation took 0.949195146560669s. -[triton-dejavu] First execution including JIT compilation took 4.75992751121521s. -[triton-dejavu] First execution including JIT compilation took 2.1755006313323975s. -[triton-dejavu] First execution including JIT compilation took 1.1709823608398438s. -[triton-dejavu] First execution including JIT compilation took 4.726720809936523s. 
-[triton-dejavu] First execution including JIT compilation took between 0.93 s and 7.32 s across the configurations in this tuning sweep.
-bench_cudagraph failed with out of resource: shared memory, Required: 248448-1060864 (varying per config), Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with Triton Error [CUDA]: out of memory.
-triton.runtime.errors.OutOfResources raised for each failing config with an identical traceback through triton_dejavu/testing.py, triton_dejavu/autotuner.py, triton/runtime/jit.py, and triton/compiler/compiler.py.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1060864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1323008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1323008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1323008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1323008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1326080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1326080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1588224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1588224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1588224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1588224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1856512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1856512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2118656, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2118656, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2118656, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2118656, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] added BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 64, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None for _chunk_state_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-339ef229a46cc5e4fefcebbabe32af549b053e9d045b9c4c60da297149a339c9/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default and key ('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.float32', 'torch.int32') -[2025-07-23 14:17:39] Triton autotuning for function _chunk_state_fwd_kernel finished after 9348.03s; best config selected: BLOCK_SIZE_M: 64, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 64, num_warps: 8, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None with benchmark time 0.003924777265638113; evaluated 2625 configurations; -[triton-dejavu] ('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32') not in cache, starting to tune... -[triton-dejavu] [2025-07-23 14:17:39] Started benchmarking of 168 configurations... (use_bo: False, run: 0) -[triton-dejavu] First execution including JIT compilation took 0.09894037246704102s. -[triton-dejavu] First execution including JIT compilation took 0.09794259071350098s. -[triton-dejavu] First execution including JIT compilation took 0.09133148193359375s. -[triton-dejavu] First execution including JIT compilation took 0.09437870979309082s. -[triton-dejavu] First execution including JIT compilation took 0.1375281810760498s. -[triton-dejavu] First execution including JIT compilation took 0.08841657638549805s. -[triton-dejavu] First execution including JIT compilation took 0.09586286544799805s. -[triton-dejavu] First execution including JIT compilation took 0.0908660888671875s. -[triton-dejavu] First execution including JIT compilation took 0.0903022289276123s. 
-[triton-dejavu] First execution including JIT compilation took 0.11986613273620605s. -[triton-dejavu] First execution including JIT compilation took 0.09716367721557617s. -[triton-dejavu] First execution including JIT compilation took 0.10063719749450684s. -[triton-dejavu] First execution including JIT compilation took 0.10032272338867188s. -[triton-dejavu] First execution including JIT compilation took 0.1031808853149414s. -[triton-dejavu] First execution including JIT compilation took 0.10322737693786621s. -[triton-dejavu] First execution including JIT compilation took 0.09868526458740234s. -[triton-dejavu] First execution including JIT compilation took 0.10200619697570801s. -[triton-dejavu] First execution including JIT compilation took 0.10516095161437988s. -[triton-dejavu] First execution including JIT compilation took 0.10310220718383789s. -[triton-dejavu] First execution including JIT compilation took 0.10310649871826172s. -[triton-dejavu] First execution including JIT compilation took 0.0993356704711914s. -[triton-dejavu] First execution including JIT compilation took 0.0988612174987793s. -[triton-dejavu] First execution including JIT compilation took 0.10302448272705078s. -[triton-dejavu] First execution including JIT compilation took 0.10656142234802246s. -[triton-dejavu] First execution including JIT compilation took 0.10141181945800781s. -[triton-dejavu] First execution including JIT compilation took 0.09873580932617188s. -[triton-dejavu] First execution including JIT compilation took 0.09862279891967773s. -[triton-dejavu] First execution including JIT compilation took 0.10381484031677246s. -[triton-dejavu] First execution including JIT compilation took 0.003009319305419922s. -[triton-dejavu] First execution including JIT compilation took 0.0979304313659668s. -[triton-dejavu] First execution including JIT compilation took 0.10118842124938965s. -[triton-dejavu] First execution including JIT compilation took 0.1023719310760498s. -[triton-dejavu] First execution including JIT compilation took 0.09882235527038574s. -[triton-dejavu] First execution including JIT compilation took 0.10287618637084961s. -[triton-dejavu] First execution including JIT compilation took 0.10065412521362305s. -[triton-dejavu] First execution including JIT compilation took 0.1038670539855957s. -[triton-dejavu] First execution including JIT compilation took 0.10204434394836426s. -[triton-dejavu] First execution including JIT compilation took 0.10319113731384277s. -[triton-dejavu] First execution including JIT compilation took 0.10232353210449219s. -[triton-dejavu] First execution including JIT compilation took 0.09966063499450684s. -[triton-dejavu] First execution including JIT compilation took 0.1052091121673584s. -[triton-dejavu] First execution including JIT compilation took 0.09914350509643555s. -[triton-dejavu] First execution including JIT compilation took 0.10142302513122559s. -[triton-dejavu] First execution including JIT compilation took 0.10095047950744629s. -[triton-dejavu] First execution including JIT compilation took 0.0988004207611084s. -[triton-dejavu] First execution including JIT compilation took 0.10207676887512207s. -[triton-dejavu] First execution including JIT compilation took 0.10127758979797363s. -[triton-dejavu] First execution including JIT compilation took 0.10261845588684082s. -[triton-dejavu] First execution including JIT compilation took 0.10390377044677734s. -[triton-dejavu] First execution including JIT compilation took 0.0035130977630615234s. 
-[triton-dejavu] First execution including JIT compilation took 0.10114288330078125s. -[triton-dejavu] First execution including JIT compilation took 0.10671138763427734s. -[triton-dejavu] First execution including JIT compilation took 0.09997725486755371s. -[triton-dejavu] First execution including JIT compilation took 0.1009225845336914s. -[triton-dejavu] First execution including JIT compilation took 0.1023416519165039s. -[triton-dejavu] First execution including JIT compilation took 0.10103917121887207s. -[triton-dejavu] First execution including JIT compilation took 0.10527372360229492s. -[triton-dejavu] First execution including JIT compilation took 0.1026146411895752s. -[triton-dejavu] First execution including JIT compilation took 0.09933710098266602s. -[triton-dejavu] First execution including JIT compilation took 0.10023188591003418s. -[triton-dejavu] First execution including JIT compilation took 0.10262489318847656s. -[triton-dejavu] First execution including JIT compilation took 0.10224008560180664s. -[triton-dejavu] First execution including JIT compilation took 0.1036233901977539s. -[triton-dejavu] First execution including JIT compilation took 0.10158872604370117s. -[triton-dejavu] First execution including JIT compilation took 0.10198545455932617s. -[triton-dejavu] First execution including JIT compilation took 0.09837794303894043s. -[triton-dejavu] First execution including JIT compilation took 0.10406804084777832s. -[triton-dejavu] First execution including JIT compilation took 0.09899592399597168s. -[triton-dejavu] First execution including JIT compilation took 0.10302305221557617s. -[triton-dejavu] First execution including JIT compilation took 0.10536479949951172s. -[triton-dejavu] First execution including JIT compilation took 0.0029289722442626953s. -[triton-dejavu] First execution including JIT compilation took 0.10014629364013672s. -[triton-dejavu] First execution including JIT compilation took 0.10506725311279297s. -[triton-dejavu] First execution including JIT compilation took 0.10248923301696777s. -[triton-dejavu] First execution including JIT compilation took 0.14687442779541016s. -[triton-dejavu] First execution including JIT compilation took 0.10432600975036621s. -[triton-dejavu] First execution including JIT compilation took 0.10353231430053711s. -[triton-dejavu] First execution including JIT compilation took 0.09760165214538574s. -[triton-dejavu] First execution including JIT compilation took 0.10556888580322266s. -[triton-dejavu] First execution including JIT compilation took 0.10235834121704102s. -[triton-dejavu] First execution including JIT compilation took 0.10123991966247559s. -[triton-dejavu] First execution including JIT compilation took 0.10434556007385254s. -[triton-dejavu] First execution including JIT compilation took 0.10260486602783203s. -[triton-dejavu] First execution including JIT compilation took 0.09864091873168945s. -[triton-dejavu] First execution including JIT compilation took 0.11648225784301758s. -[triton-dejavu] First execution including JIT compilation took 0.10767412185668945s. -[triton-dejavu] First execution including JIT compilation took 0.09799385070800781s. -[triton-dejavu] First execution including JIT compilation took 0.12165379524230957s. -[triton-dejavu] First execution including JIT compilation took 0.10323834419250488s. -[triton-dejavu] First execution including JIT compilation took 0.1541898250579834s. -[triton-dejavu] First execution including JIT compilation took 0.12132406234741211s. 
-[triton-dejavu] First execution including JIT compilation took 0.0029497146606445312s. -[triton-dejavu] First execution including JIT compilation took 0.10245823860168457s. -[triton-dejavu] First execution including JIT compilation took 0.12180399894714355s. -[triton-dejavu] First execution including JIT compilation took 0.10425543785095215s. -[triton-dejavu] First execution including JIT compilation took 0.09953522682189941s. -[triton-dejavu] First execution including JIT compilation took 0.11828899383544922s. -[triton-dejavu] First execution including JIT compilation took 0.1033179759979248s. -[triton-dejavu] First execution including JIT compilation took 0.09932780265808105s. -[triton-dejavu] First execution including JIT compilation took 0.12465763092041016s. -[triton-dejavu] First execution including JIT compilation took 0.10105299949645996s. -[triton-dejavu] First execution including JIT compilation took 0.10276103019714355s. -[triton-dejavu] First execution including JIT compilation took 0.11753654479980469s. -[triton-dejavu] First execution including JIT compilation took 0.10199093818664551s. -[triton-dejavu] First execution including JIT compilation took 0.10073208808898926s. -[triton-dejavu] First execution including JIT compilation took 0.13183021545410156s. -[triton-dejavu] First execution including JIT compilation took 0.11950993537902832s. -[triton-dejavu] First execution including JIT compilation took 0.10588884353637695s. -[triton-dejavu] First execution including JIT compilation took 0.12990760803222656s. -[triton-dejavu] First execution including JIT compilation took 0.12318229675292969s. -[triton-dejavu] First execution including JIT compilation took 0.10443115234375s. -[triton-dejavu] First execution including JIT compilation took 0.1310744285583496s. -[triton-dejavu] First execution including JIT compilation took 0.0029449462890625s. -[triton-dejavu] First execution including JIT compilation took 0.10070252418518066s. -[triton-dejavu] First execution including JIT compilation took 0.13354873657226562s. -[triton-dejavu] First execution including JIT compilation took 0.1201629638671875s. -[triton-dejavu] First execution including JIT compilation took 0.10523557662963867s. -[triton-dejavu] First execution including JIT compilation took 0.12919878959655762s. -[triton-dejavu] First execution including JIT compilation took 0.12137508392333984s. -[triton-dejavu] First execution including JIT compilation took 0.10500240325927734s. -[triton-dejavu] First execution including JIT compilation took 0.1276566982269287s. -[triton-dejavu] First execution including JIT compilation took 0.12091207504272461s. -[triton-dejavu] First execution including JIT compilation took 0.0987248420715332s. -[triton-dejavu] First execution including JIT compilation took 0.12950658798217773s. -[triton-dejavu] First execution including JIT compilation took 0.12207913398742676s. -[triton-dejavu] First execution including JIT compilation took 0.10215497016906738s. -[triton-dejavu] First execution including JIT compilation took 0.16368603706359863s. -[triton-dejavu] First execution including JIT compilation took 0.1331336498260498s. -[triton-dejavu] First execution including JIT compilation took 0.12007308006286621s. -[triton-dejavu] First execution including JIT compilation took 0.1515827178955078s. -[triton-dejavu] First execution including JIT compilation took 0.1267712116241455s. -[triton-dejavu] First execution including JIT compilation took 0.12165069580078125s. 
-[triton-dejavu] First execution including JIT compilation took 0.1454930305480957s. -[triton-dejavu] First execution including JIT compilation took 0.003009319305419922s. -[triton-dejavu] First execution including JIT compilation took 0.11997270584106445s. -[triton-dejavu] First execution including JIT compilation took 0.15126681327819824s. -[triton-dejavu] First execution including JIT compilation took 0.12771224975585938s. -[triton-dejavu] First execution including JIT compilation took 0.11920666694641113s. -[triton-dejavu] First execution including JIT compilation took 0.14812898635864258s. -[triton-dejavu] First execution including JIT compilation took 0.13222432136535645s. -[triton-dejavu] First execution including JIT compilation took 0.11958074569702148s. -[triton-dejavu] First execution including JIT compilation took 0.15191054344177246s. -[triton-dejavu] First execution including JIT compilation took 0.13075757026672363s. -[triton-dejavu] First execution including JIT compilation took 0.11592960357666016s. -[triton-dejavu] First execution including JIT compilation took 0.14753198623657227s. -[triton-dejavu] First execution including JIT compilation took 0.1284317970275879s. -[triton-dejavu] First execution including JIT compilation took 0.12173199653625488s. -[triton-dejavu] First execution including JIT compilation took 0.18095922470092773s. -[triton-dejavu] First execution including JIT compilation took 0.14812517166137695s. -[triton-dejavu] First execution including JIT compilation took 0.1308438777923584s. -[triton-dejavu] First execution including JIT compilation took 0.18799710273742676s. -[triton-dejavu] First execution including JIT compilation took 0.14983463287353516s. -[triton-dejavu] First execution including JIT compilation took 0.13013529777526855s. -[triton-dejavu] First execution including JIT compilation took 0.22894525527954102s. -[triton-dejavu] First execution including JIT compilation took 0.14696717262268066s. -[triton-dejavu] First execution including JIT compilation took 0.1867072582244873s. -[triton-dejavu] First execution including JIT compilation took 0.18308377265930176s. -[triton-dejavu] First execution including JIT compilation took 0.15039896965026855s. -[triton-dejavu] First execution including JIT compilation took 0.12948107719421387s. -[triton-dejavu] First execution including JIT compilation took 0.1869828701019287s. -[triton-dejavu] First execution including JIT compilation took 0.15206503868103027s. -[triton-dejavu] First execution including JIT compilation took 0.12766575813293457s. -[triton-dejavu] First execution including JIT compilation took 0.19213628768920898s. -[triton-dejavu] First execution including JIT compilation took 0.14951205253601074s. -[triton-dejavu] First execution including JIT compilation took 0.131011962890625s. -[triton-dejavu] First execution including JIT compilation took 0.19015717506408691s. -[triton-dejavu] First execution including JIT compilation took 0.14911913871765137s. -[triton-dejavu] First execution including JIT compilation took 0.1303267478942871s. 
-[triton-dejavu] added BLOCK_SIZE: 512, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None for _state_passing_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default and key ('8192', 'torch.float32', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32') -[2025-07-23 14:22:15] Triton autotuning for function _state_passing_fwd_kernel finished after 275.26s; best config selected: BLOCK_SIZE: 512, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None with benchmark time 0.0030820679385215044; evaluated 168 configurations; -[triton-dejavu] ('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32') not in cache, starting to tune... -[triton-dejavu] [2025-07-23 14:22:15] Started benchmarking of 2625 configurations... (use_bo: False, run: 0) -[triton-dejavu] First execution including JIT compilation took 0.1971287727355957s. -[triton-dejavu] First execution including JIT compilation took 0.18145108222961426s. -[triton-dejavu] First execution including JIT compilation took 0.18181228637695312s. -[triton-dejavu] First execution including JIT compilation took 0.20481252670288086s. -[triton-dejavu] First execution including JIT compilation took 0.19466614723205566s. -[triton-dejavu] First execution including JIT compilation took 0.17426085472106934s. -[triton-dejavu] First execution including JIT compilation took 0.21188688278198242s. -[triton-dejavu] First execution including JIT compilation took 0.20443081855773926s. -[triton-dejavu] First execution including JIT compilation took 0.18296051025390625s. -[triton-dejavu] First execution including JIT compilation took 0.21415448188781738s. -[triton-dejavu] First execution including JIT compilation took 0.20465874671936035s. -[triton-dejavu] First execution including JIT compilation took 0.1801447868347168s. -[triton-dejavu] First execution including JIT compilation took 0.21986842155456543s. -[triton-dejavu] First execution including JIT compilation took 0.2162468433380127s. -[triton-dejavu] First execution including JIT compilation took 0.17408537864685059s. -[triton-dejavu] First execution including JIT compilation took 0.23129940032958984s. -[triton-dejavu] First execution including JIT compilation took 0.22765421867370605s. -[triton-dejavu] First execution including JIT compilation took 0.171464204788208s. -[triton-dejavu] First execution including JIT compilation took 0.24284863471984863s. -[triton-dejavu] First execution including JIT compilation took 0.2351231575012207s. -[triton-dejavu] First execution including JIT compilation took 0.17095470428466797s. -[triton-dejavu] First execution including JIT compilation took 0.19266152381896973s. -[triton-dejavu] First execution including JIT compilation took 0.19104242324829102s. -[triton-dejavu] First execution including JIT compilation took 0.1844947338104248s. -[triton-dejavu] First execution including JIT compilation took 0.20961451530456543s. -[triton-dejavu] First execution including JIT compilation took 0.20723509788513184s. 
-[triton-dejavu] First execution including JIT compilation took 0.20163917541503906s. -[triton-dejavu] First execution including JIT compilation took 0.22046804428100586s. -[triton-dejavu] First execution including JIT compilation took 0.21246051788330078s. -[triton-dejavu] First execution including JIT compilation took 0.21422886848449707s. -[triton-dejavu] First execution including JIT compilation took 0.2310943603515625s. -[triton-dejavu] First execution including JIT compilation took 0.22539591789245605s. -[triton-dejavu] First execution including JIT compilation took 0.21231532096862793s. -[triton-dejavu] First execution including JIT compilation took 0.23757481575012207s. -[triton-dejavu] First execution including JIT compilation took 0.2188396453857422s. -[triton-dejavu] First execution including JIT compilation took 0.22507905960083008s. -[triton-dejavu] First execution including JIT compilation took 0.25089573860168457s. -[triton-dejavu] First execution including JIT compilation took 0.2278902530670166s. -[triton-dejavu] First execution including JIT compilation took 0.22785520553588867s. -[triton-dejavu] First execution including JIT compilation took 0.2572140693664551s. -[triton-dejavu] First execution including JIT compilation took 0.24475598335266113s. -[triton-dejavu] First execution including JIT compilation took 0.275850772857666s. -[triton-dejavu] First execution including JIT compilation took 0.21295976638793945s. -[triton-dejavu] First execution including JIT compilation took 0.19115710258483887s. -[triton-dejavu] First execution including JIT compilation took 0.18381786346435547s. -[triton-dejavu] First execution including JIT compilation took 0.21824884414672852s. -[triton-dejavu] First execution including JIT compilation took 0.2043604850769043s. -[triton-dejavu] First execution including JIT compilation took 0.2003331184387207s. -[triton-dejavu] First execution including JIT compilation took 0.2335672378540039s. -[triton-dejavu] First execution including JIT compilation took 0.21690750122070312s. -[triton-dejavu] First execution including JIT compilation took 0.18306660652160645s. -[triton-dejavu] First execution including JIT compilation took 0.2298128604888916s. -[triton-dejavu] First execution including JIT compilation took 0.19614720344543457s. -[triton-dejavu] First execution including JIT compilation took 0.2060997486114502s. -[triton-dejavu] First execution including JIT compilation took 0.21973228454589844s. -[triton-dejavu] First execution including JIT compilation took 0.18016934394836426s. -[triton-dejavu] First execution including JIT compilation took 0.20123672485351562s. -[triton-dejavu] First execution including JIT compilation took 0.24709606170654297s. -[triton-dejavu] First execution including JIT compilation took 0.20981693267822266s. -[triton-dejavu] First execution including JIT compilation took 0.2014932632446289s. -[triton-dejavu] First execution including JIT compilation took 0.25247669219970703s. -[triton-dejavu] First execution including JIT compilation took 0.20742201805114746s. -[triton-dejavu] First execution including JIT compilation took 0.23698949813842773s. -[triton-dejavu] First execution including JIT compilation took 0.20906853675842285s. -[triton-dejavu] First execution including JIT compilation took 0.19327425956726074s. -[triton-dejavu] First execution including JIT compilation took 0.1958320140838623s. -[triton-dejavu] First execution including JIT compilation took 0.22327661514282227s. 
-[triton-dejavu] First execution including JIT compilation took 0.2055678367614746s. -[triton-dejavu] First execution including JIT compilation took 0.22722506523132324s. -[triton-dejavu] First execution including JIT compilation took 0.29752469062805176s. -[triton-dejavu] First execution including JIT compilation took 0.24663901329040527s. -[triton-dejavu] First execution including JIT compilation took 0.22910308837890625s. -[triton-dejavu] First execution including JIT compilation took 0.30620813369750977s. -[triton-dejavu] First execution including JIT compilation took 0.2616004943847656s. -[triton-dejavu] First execution including JIT compilation took 0.24086618423461914s. -[triton-dejavu] First execution including JIT compilation took 0.31242823600769043s. -[triton-dejavu] First execution including JIT compilation took 0.26506876945495605s. -[triton-dejavu] First execution including JIT compilation took 0.24354910850524902s. -[triton-dejavu] First execution including JIT compilation took 0.33380126953125s. -[triton-dejavu] First execution including JIT compilation took 0.27773475646972656s. -[triton-dejavu] First execution including JIT compilation took 0.25086140632629395s. -[triton-dejavu] First execution including JIT compilation took 0.3661017417907715s. -[triton-dejavu] First execution including JIT compilation took 0.29815053939819336s. -[triton-dejavu] First execution including JIT compilation took 0.26163578033447266s. -[triton-dejavu] First execution including JIT compilation took 0.32303476333618164s. -[triton-dejavu] First execution including JIT compilation took 0.26395726203918457s. -[triton-dejavu] First execution including JIT compilation took 0.24637055397033691s. -[triton-dejavu] First execution including JIT compilation took 0.35486674308776855s. -[triton-dejavu] First execution including JIT compilation took 0.2868657112121582s. -[triton-dejavu] First execution including JIT compilation took 0.25715160369873047s. -[triton-dejavu] First execution including JIT compilation took 0.38285207748413086s. -[triton-dejavu] First execution including JIT compilation took 0.31813502311706543s. -[triton-dejavu] First execution including JIT compilation took 0.2733750343322754s. -[triton-dejavu] First execution including JIT compilation took 0.4395263195037842s. -[triton-dejavu] First execution including JIT compilation took 0.3358616828918457s. -[triton-dejavu] First execution including JIT compilation took 0.29150915145874023s. -[triton-dejavu] First execution including JIT compilation took 0.5277695655822754s. -[triton-dejavu] First execution including JIT compilation took 0.3672621250152588s. -[triton-dejavu] First execution including JIT compilation took 0.3096439838409424s. -[triton-dejavu] First execution including JIT compilation took 0.5141489505767822s. -[triton-dejavu] First execution including JIT compilation took 0.37669992446899414s. -[triton-dejavu] First execution including JIT compilation took 0.311464786529541s. -[triton-dejavu] First execution including JIT compilation took 0.5582582950592041s. -[triton-dejavu] First execution including JIT compilation took 0.4001944065093994s. -[triton-dejavu] First execution including JIT compilation took 0.3267343044281006s. -[triton-dejavu] First execution including JIT compilation took 0.19158482551574707s. -[triton-dejavu] First execution including JIT compilation took 0.18754005432128906s. -[triton-dejavu] First execution including JIT compilation took 0.17347002029418945s. 
-[triton-dejavu] First execution including JIT compilation took 0.20695137977600098s. -[triton-dejavu] First execution including JIT compilation took 0.1933000087738037s. -[triton-dejavu] First execution including JIT compilation took 0.18866968154907227s. -[triton-dejavu] First execution including JIT compilation took 0.21515560150146484s. -[triton-dejavu] First execution including JIT compilation took 0.22445225715637207s. -[triton-dejavu] First execution including JIT compilation took 0.19047832489013672s. -[triton-dejavu] First execution including JIT compilation took 0.21751952171325684s. -[triton-dejavu] First execution including JIT compilation took 0.2095792293548584s. -[triton-dejavu] First execution including JIT compilation took 0.1899867057800293s. -[triton-dejavu] First execution including JIT compilation took 0.22539901733398438s. -[triton-dejavu] First execution including JIT compilation took 0.214493989944458s. -[triton-dejavu] First execution including JIT compilation took 0.19467592239379883s. -[triton-dejavu] First execution including JIT compilation took 0.23510289192199707s. -[triton-dejavu] First execution including JIT compilation took 0.23056340217590332s. -[triton-dejavu] First execution including JIT compilation took 0.20400166511535645s. -[triton-dejavu] First execution including JIT compilation took 0.25162243843078613s. -[triton-dejavu] First execution including JIT compilation took 0.24202203750610352s. -[triton-dejavu] First execution including JIT compilation took 0.2179243564605713s. -[triton-dejavu] First execution including JIT compilation took 0.20107483863830566s. -[triton-dejavu] First execution including JIT compilation took 0.19151616096496582s. -[triton-dejavu] First execution including JIT compilation took 0.1852731704711914s. -[triton-dejavu] First execution including JIT compilation took 0.2255268096923828s. -[triton-dejavu] First execution including JIT compilation took 0.20557308197021484s. -[triton-dejavu] First execution including JIT compilation took 0.19893908500671387s. -[triton-dejavu] First execution including JIT compilation took 0.2369074821472168s. -[triton-dejavu] First execution including JIT compilation took 0.22331023216247559s. -[triton-dejavu] First execution including JIT compilation took 0.2091996669769287s. -[triton-dejavu] First execution including JIT compilation took 0.2456064224243164s. -[triton-dejavu] First execution including JIT compilation took 0.24354219436645508s. -[triton-dejavu] First execution including JIT compilation took 0.2187485694885254s. -[triton-dejavu] First execution including JIT compilation took 0.26705098152160645s. -[triton-dejavu] First execution including JIT compilation took 0.22890710830688477s. -[triton-dejavu] First execution including JIT compilation took 0.228562593460083s. -[triton-dejavu] First execution including JIT compilation took 0.260115385055542s. -[triton-dejavu] First execution including JIT compilation took 0.23436951637268066s. -[triton-dejavu] First execution including JIT compilation took 0.24272942543029785s. -[triton-dejavu] First execution including JIT compilation took 0.2728395462036133s. -[triton-dejavu] First execution including JIT compilation took 0.25573110580444336s. -[triton-dejavu] First execution including JIT compilation took 0.239990234375s. -[triton-dejavu] First execution including JIT compilation took 0.23782968521118164s. -[triton-dejavu] First execution including JIT compilation took 0.20571660995483398s. 
-[triton-dejavu] (log condensed) several hundred entries of the form "First execution including JIT compilation took <t>s.", with t ranging from roughly 0.19 s for the smallest configurations to about 5.5 s for the largest.
-bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-(each failure was reported three times with an identical triton.runtime.errors.OutOfResources traceback: triton_dejavu/testing.py:_do_bench_cudagraph -> testing.py:jit_first_time -> triton_dejavu/autotuner.py:kernel_call -> triton/runtime/jit.py:run -> triton/compiler/compiler.py:_init_handles; the duplicate tracebacks are omitted below)
-bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 487424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-[triton-dejavu] (log condensed) further JIT-compilation timing entries, interleaved between the failures above, between roughly 0.21 s and 3.9 s.
-bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 278528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 557056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 696320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 696320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 696320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 696320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 696320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 696320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 974848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 974848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 974848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 974848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 974848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 974848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.16099905967712402s. -[triton-dejavu] First execution including JIT compilation took 0.16479063034057617s. -[triton-dejavu] First execution including JIT compilation took 0.16184067726135254s. -[triton-dejavu] First execution including JIT compilation took 0.18947720527648926s. -[triton-dejavu] First execution including JIT compilation took 0.17523646354675293s. -[triton-dejavu] First execution including JIT compilation took 0.20532774925231934s. -[triton-dejavu] First execution including JIT compilation took 0.3007838726043701s. -[triton-dejavu] First execution including JIT compilation took 0.2070615291595459s. -[triton-dejavu] First execution including JIT compilation took 0.19569897651672363s. -[triton-dejavu] First execution including JIT compilation took 0.22059941291809082s. -[triton-dejavu] First execution including JIT compilation took 0.2129993438720703s. -[triton-dejavu] First execution including JIT compilation took 0.19128966331481934s. -[triton-dejavu] First execution including JIT compilation took 0.23188138008117676s. -[triton-dejavu] First execution including JIT compilation took 0.23681974411010742s. -[triton-dejavu] First execution including JIT compilation took 0.20053791999816895s. -[triton-dejavu] First execution including JIT compilation took 0.23547863960266113s. -[triton-dejavu] First execution including JIT compilation took 0.24089622497558594s. -[triton-dejavu] First execution including JIT compilation took 0.20086455345153809s. -[triton-dejavu] First execution including JIT compilation took 0.2673182487487793s. -[triton-dejavu] First execution including JIT compilation took 0.2323765754699707s. -[triton-dejavu] First execution including JIT compilation took 0.20696759223937988s. -[triton-dejavu] First execution including JIT compilation took 0.2162766456604004s. -[triton-dejavu] First execution including JIT compilation took 0.19398021697998047s. -[triton-dejavu] First execution including JIT compilation took 0.19721055030822754s. 
-[triton-dejavu] First execution including JIT compilation took 0.2597033977508545s. -[triton-dejavu] First execution including JIT compilation took 0.21506524085998535s. -[triton-dejavu] First execution including JIT compilation took 0.19923901557922363s. -[triton-dejavu] First execution including JIT compilation took 0.23490643501281738s. -[triton-dejavu] First execution including JIT compilation took 0.21814656257629395s. -[triton-dejavu] First execution including JIT compilation took 0.2114255428314209s. -[triton-dejavu] First execution including JIT compilation took 0.25490689277648926s. -[triton-dejavu] First execution including JIT compilation took 0.23802495002746582s. -[triton-dejavu] First execution including JIT compilation took 0.2209463119506836s. -[triton-dejavu] First execution including JIT compilation took 0.26262378692626953s. -[triton-dejavu] First execution including JIT compilation took 0.24033474922180176s. -[triton-dejavu] First execution including JIT compilation took 0.22833895683288574s. -[triton-dejavu] First execution including JIT compilation took 0.27592968940734863s. -[triton-dejavu] First execution including JIT compilation took 0.30753660202026367s. -[triton-dejavu] First execution including JIT compilation took 0.24009227752685547s. -[triton-dejavu] First execution including JIT compilation took 0.3198988437652588s. -[triton-dejavu] First execution including JIT compilation took 0.2536334991455078s. -[triton-dejavu] First execution including JIT compilation took 0.24361896514892578s. -[triton-dejavu] First execution including JIT compilation took 0.25406312942504883s. -[triton-dejavu] First execution including JIT compilation took 0.20447945594787598s. -[triton-dejavu] First execution including JIT compilation took 0.19324684143066406s. -[triton-dejavu] First execution including JIT compilation took 0.2655496597290039s. -[triton-dejavu] First execution including JIT compilation took 0.2157602310180664s. -[triton-dejavu] First execution including JIT compilation took 0.21718740463256836s. -[triton-dejavu] First execution including JIT compilation took 0.2685074806213379s. -[triton-dejavu] First execution including JIT compilation took 0.22944235801696777s. -[triton-dejavu] First execution including JIT compilation took 0.26235318183898926s. -[triton-dejavu] First execution including JIT compilation took 0.2784006595611572s. -[triton-dejavu] First execution including JIT compilation took 0.2505967617034912s. -[triton-dejavu] First execution including JIT compilation took 0.2174668312072754s. -[triton-dejavu] First execution including JIT compilation took 0.3032703399658203s. -[triton-dejavu] First execution including JIT compilation took 0.24589204788208008s. -[triton-dejavu] First execution including JIT compilation took 0.24153470993041992s. -[triton-dejavu] First execution including JIT compilation took 0.3123292922973633s. -[triton-dejavu] First execution including JIT compilation took 0.25666284561157227s. -[triton-dejavu] First execution including JIT compilation took 0.24495959281921387s. -[triton-dejavu] First execution including JIT compilation took 0.40181756019592285s. -[triton-dejavu] First execution including JIT compilation took 0.2874319553375244s. -[triton-dejavu] First execution including JIT compilation took 0.26238536834716797s. -[triton-dejavu] First execution including JIT compilation took 0.29593992233276367s. -[triton-dejavu] First execution including JIT compilation took 0.2483835220336914s. 
-[triton-dejavu] First execution including JIT compilation took 0.21873855590820312s. -[triton-dejavu] First execution including JIT compilation took 0.3240337371826172s. -[triton-dejavu] First execution including JIT compilation took 0.27186155319213867s. -[triton-dejavu] First execution including JIT compilation took 0.2367856502532959s. -[triton-dejavu] First execution including JIT compilation took 0.36632680892944336s. -[triton-dejavu] First execution including JIT compilation took 0.28855419158935547s. -[triton-dejavu] First execution including JIT compilation took 0.252063512802124s. -[triton-dejavu] First execution including JIT compilation took 0.3803398609161377s. -[triton-dejavu] First execution including JIT compilation took 0.3005537986755371s. -[triton-dejavu] First execution including JIT compilation took 0.2524375915527344s. -[triton-dejavu] First execution including JIT compilation took 0.39989233016967773s. -[triton-dejavu] First execution including JIT compilation took 0.30489444732666016s. -[triton-dejavu] First execution including JIT compilation took 0.3460044860839844s. -[triton-dejavu] First execution including JIT compilation took 0.4346785545349121s. -[triton-dejavu] First execution including JIT compilation took 0.31057190895080566s. -[triton-dejavu] First execution including JIT compilation took 0.2753105163574219s. -[triton-dejavu] First execution including JIT compilation took 0.4804239273071289s. -[triton-dejavu] First execution including JIT compilation took 0.6045436859130859s. -[triton-dejavu] First execution including JIT compilation took 0.5239622592926025s. -[triton-dejavu] First execution including JIT compilation took 0.42328929901123047s. -[triton-dejavu] First execution including JIT compilation took 0.29734230041503906s. -[triton-dejavu] First execution including JIT compilation took 0.25474023818969727s. -[triton-dejavu] First execution including JIT compilation took 0.4383995532989502s. -[triton-dejavu] First execution including JIT compilation took 0.317385196685791s. -[triton-dejavu] First execution including JIT compilation took 0.2737693786621094s. -[triton-dejavu] First execution including JIT compilation took 0.5553841590881348s. -[triton-dejavu] First execution including JIT compilation took 0.35834789276123047s. -[triton-dejavu] First execution including JIT compilation took 0.33359360694885254s. -[triton-dejavu] First execution including JIT compilation took 0.6015679836273193s. -[triton-dejavu] First execution including JIT compilation took 0.3901402950286865s. -[triton-dejavu] First execution including JIT compilation took 0.5647265911102295s. -[triton-dejavu] First execution including JIT compilation took 0.6674647331237793s. -[triton-dejavu] First execution including JIT compilation took 0.4167792797088623s. -[triton-dejavu] First execution including JIT compilation took 0.3413257598876953s. -[triton-dejavu] First execution including JIT compilation took 0.7596611976623535s. -[triton-dejavu] First execution including JIT compilation took 0.43511199951171875s. -[triton-dejavu] First execution including JIT compilation took 0.3729708194732666s. -[triton-dejavu] First execution including JIT compilation took 0.7846958637237549s. -[triton-dejavu] First execution including JIT compilation took 0.6946334838867188s. -[triton-dejavu] First execution including JIT compilation took 0.433488130569458s. -[triton-dejavu] First execution including JIT compilation took 0.4143795967102051s. 
-[triton-dejavu] First execution including JIT compilation took 0.19273090362548828s. -[triton-dejavu] First execution including JIT compilation took 0.1894831657409668s. -[triton-dejavu] First execution including JIT compilation took 0.21772050857543945s. -[triton-dejavu] First execution including JIT compilation took 0.20870161056518555s. -[triton-dejavu] First execution including JIT compilation took 0.19836974143981934s. -[triton-dejavu] First execution including JIT compilation took 0.23154735565185547s. -[triton-dejavu] First execution including JIT compilation took 0.20255732536315918s. -[triton-dejavu] First execution including JIT compilation took 0.20589423179626465s. -[triton-dejavu] First execution including JIT compilation took 0.23143506050109863s. -[triton-dejavu] First execution including JIT compilation took 0.2238476276397705s. -[triton-dejavu] First execution including JIT compilation took 0.22137689590454102s. -[triton-dejavu] First execution including JIT compilation took 0.23688888549804688s. -[triton-dejavu] First execution including JIT compilation took 0.22747373580932617s. -[triton-dejavu] First execution including JIT compilation took 0.21170425415039062s. -[triton-dejavu] First execution including JIT compilation took 0.25104713439941406s. -[triton-dejavu] First execution including JIT compilation took 0.23560285568237305s. -[triton-dejavu] First execution including JIT compilation took 0.22189855575561523s. -[triton-dejavu] First execution including JIT compilation took 0.26841211318969727s. -[triton-dejavu] First execution including JIT compilation took 0.24857425689697266s. -[triton-dejavu] First execution including JIT compilation took 0.24297785758972168s. -[triton-dejavu] First execution including JIT compilation took 0.22948384284973145s. -[triton-dejavu] First execution including JIT compilation took 0.1988661289215088s. -[triton-dejavu] First execution including JIT compilation took 0.1896975040435791s. -[triton-dejavu] First execution including JIT compilation took 0.24746084213256836s. -[triton-dejavu] First execution including JIT compilation took 0.2185649871826172s. -[triton-dejavu] First execution including JIT compilation took 0.2041003704071045s. -[triton-dejavu] First execution including JIT compilation took 0.2650284767150879s. -[triton-dejavu] First execution including JIT compilation took 0.29841017723083496s. -[triton-dejavu] First execution including JIT compilation took 0.22010397911071777s. -[triton-dejavu] First execution including JIT compilation took 0.29136109352111816s. -[triton-dejavu] First execution including JIT compilation took 0.24800348281860352s. -[triton-dejavu] First execution including JIT compilation took 0.22483563423156738s. -[triton-dejavu] First execution including JIT compilation took 0.27678871154785156s. -[triton-dejavu] First execution including JIT compilation took 0.24335885047912598s. -[triton-dejavu] First execution including JIT compilation took 0.23481535911560059s. -[triton-dejavu] First execution including JIT compilation took 0.288956880569458s. -[triton-dejavu] First execution including JIT compilation took 0.27408528327941895s. -[triton-dejavu] First execution including JIT compilation took 0.23581624031066895s. -[triton-dejavu] First execution including JIT compilation took 0.4908866882324219s. -[triton-dejavu] First execution including JIT compilation took 0.40228939056396484s. -[triton-dejavu] First execution including JIT compilation took 0.24046826362609863s. 
-[triton-dejavu] First execution including JIT compilation took 0.26389145851135254s. -[triton-dejavu] First execution including JIT compilation took 0.22148394584655762s. -[triton-dejavu] First execution including JIT compilation took 0.5583405494689941s. -[triton-dejavu] First execution including JIT compilation took 0.2856779098510742s. -[triton-dejavu] First execution including JIT compilation took 0.2353372573852539s. -[triton-dejavu] First execution including JIT compilation took 0.6925959587097168s. -[triton-dejavu] First execution including JIT compilation took 0.3485393524169922s. -[triton-dejavu] First execution including JIT compilation took 0.5607750415802002s. -[triton-dejavu] First execution including JIT compilation took 0.23984003067016602s. -[triton-dejavu] First execution including JIT compilation took 0.3213932514190674s. -[triton-dejavu] First execution including JIT compilation took 0.6764676570892334s. -[triton-dejavu] First execution including JIT compilation took 0.23536252975463867s. -[triton-dejavu] First execution including JIT compilation took 0.34651637077331543s. -[triton-dejavu] First execution including JIT compilation took 0.28470325469970703s. -[triton-dejavu] First execution including JIT compilation took 0.31414175033569336s. -[triton-dejavu] First execution including JIT compilation took 0.37757158279418945s. -[triton-dejavu] First execution including JIT compilation took 0.5653142929077148s. -[triton-dejavu] First execution including JIT compilation took 0.26725172996520996s. -[triton-dejavu] First execution including JIT compilation took 0.41627001762390137s. -[triton-dejavu] First execution including JIT compilation took 0.31640172004699707s. -[triton-dejavu] First execution including JIT compilation took 0.28881263732910156s. -[triton-dejavu] First execution including JIT compilation took 0.3590090274810791s. -[triton-dejavu] First execution including JIT compilation took 0.2639732360839844s. -[triton-dejavu] First execution including JIT compilation took 0.2286384105682373s. -[triton-dejavu] First execution including JIT compilation took 0.3730354309082031s. -[triton-dejavu] First execution including JIT compilation took 0.2863786220550537s. -[triton-dejavu] First execution including JIT compilation took 0.2942509651184082s. -[triton-dejavu] First execution including JIT compilation took 0.39931154251098633s. -[triton-dejavu] First execution including JIT compilation took 0.29895520210266113s. -[triton-dejavu] First execution including JIT compilation took 0.25076842308044434s. -[triton-dejavu] First execution including JIT compilation took 0.44957423210144043s. -[triton-dejavu] First execution including JIT compilation took 0.3334476947784424s. -[triton-dejavu] First execution including JIT compilation took 0.2527899742126465s. -[triton-dejavu] First execution including JIT compilation took 0.4636514186859131s. -[triton-dejavu] First execution including JIT compilation took 0.3361804485321045s. -[triton-dejavu] First execution including JIT compilation took 0.27211809158325195s. -[triton-dejavu] First execution including JIT compilation took 0.49650144577026367s. -[triton-dejavu] First execution including JIT compilation took 0.3559887409210205s. -[triton-dejavu] First execution including JIT compilation took 0.28218793869018555s. -[triton-dejavu] First execution including JIT compilation took 0.5615448951721191s. -[triton-dejavu] First execution including JIT compilation took 0.7608671188354492s. 
-[triton-dejavu] First execution including JIT compilation took 0.3274657726287842s. -[triton-dejavu] First execution including JIT compilation took 0.5771064758300781s. -[triton-dejavu] First execution including JIT compilation took 0.3666727542877197s. -[triton-dejavu] First execution including JIT compilation took 0.2816441059112549s. -[triton-dejavu] First execution including JIT compilation took 0.5920066833496094s. -[triton-dejavu] First execution including JIT compilation took 0.39211297035217285s. -[triton-dejavu] First execution including JIT compilation took 0.3664851188659668s. -[triton-dejavu] First execution including JIT compilation took 0.7597804069519043s. -[triton-dejavu] First execution including JIT compilation took 0.4436652660369873s. -[triton-dejavu] First execution including JIT compilation took 0.3279578685760498s. -[triton-dejavu] First execution including JIT compilation took 0.8084597587585449s. -[triton-dejavu] First execution including JIT compilation took 0.4614524841308594s. -[triton-dejavu] First execution including JIT compilation took 0.33185672760009766s. -[triton-dejavu] First execution including JIT compilation took 0.8718667030334473s. -[triton-dejavu] First execution including JIT compilation took 0.47515869140625s. -[triton-dejavu] First execution including JIT compilation took 0.34561920166015625s. -[triton-dejavu] First execution including JIT compilation took 0.9526774883270264s. -[triton-dejavu] First execution including JIT compilation took 0.5167005062103271s. -[triton-dejavu] First execution including JIT compilation took 0.36385536193847656s. -[triton-dejavu] First execution including JIT compilation took 1.0977909564971924s. -[triton-dejavu] First execution including JIT compilation took 0.5700552463531494s. -[triton-dejavu] First execution including JIT compilation took 0.39052820205688477s. -[triton-dejavu] First execution including JIT compilation took 0.24593067169189453s. -[triton-dejavu] First execution including JIT compilation took 0.20725393295288086s. -[triton-dejavu] First execution including JIT compilation took 0.20369458198547363s. -[triton-dejavu] First execution including JIT compilation took 0.2673497200012207s. -[triton-dejavu] First execution including JIT compilation took 0.22520208358764648s. -[triton-dejavu] First execution including JIT compilation took 0.2131955623626709s. -[triton-dejavu] First execution including JIT compilation took 0.2753727436065674s. -[triton-dejavu] First execution including JIT compilation took 0.23779654502868652s. -[triton-dejavu] First execution including JIT compilation took 0.2212817668914795s. -[triton-dejavu] First execution including JIT compilation took 0.28991055488586426s. -[triton-dejavu] First execution including JIT compilation took 0.25224971771240234s. -[triton-dejavu] First execution including JIT compilation took 0.22817182540893555s. -[triton-dejavu] First execution including JIT compilation took 0.2966287136077881s. -[triton-dejavu] First execution including JIT compilation took 0.25997209548950195s. -[triton-dejavu] First execution including JIT compilation took 0.24083781242370605s. -[triton-dejavu] First execution including JIT compilation took 0.3113217353820801s. -[triton-dejavu] First execution including JIT compilation took 0.2704799175262451s. -[triton-dejavu] First execution including JIT compilation took 0.24837803840637207s. -[triton-dejavu] First execution including JIT compilation took 0.3431241512298584s. 
-[triton-dejavu] First execution including JIT compilation took 0.28302574157714844s. -[triton-dejavu] First execution including JIT compilation took 0.27059054374694824s. -[triton-dejavu] First execution including JIT compilation took 0.28479981422424316s. -[triton-dejavu] First execution including JIT compilation took 0.23545360565185547s. -[triton-dejavu] First execution including JIT compilation took 0.21197509765625s. -[triton-dejavu] First execution including JIT compilation took 0.29216909408569336s. -[triton-dejavu] First execution including JIT compilation took 0.27100205421447754s. -[triton-dejavu] First execution including JIT compilation took 0.21901345252990723s. -[triton-dejavu] First execution including JIT compilation took 0.33516407012939453s. -[triton-dejavu] First execution including JIT compilation took 0.24909710884094238s. -[triton-dejavu] First execution including JIT compilation took 0.22092056274414062s. -[triton-dejavu] First execution including JIT compilation took 0.3268437385559082s. -[triton-dejavu] First execution including JIT compilation took 0.2536735534667969s. -[triton-dejavu] First execution including JIT compilation took 0.22658419609069824s. -[triton-dejavu] First execution including JIT compilation took 0.0029747486114501953s. -[triton-dejavu] First execution including JIT compilation took 0.22745442390441895s. -[triton-dejavu] First execution including JIT compilation took 0.20766067504882812s. -[triton-dejavu] First execution including JIT compilation took 0.28397655487060547s. -[triton-dejavu] First execution including JIT compilation took 0.239030122756958s. -[triton-dejavu] First execution including JIT compilation took 0.2599983215332031s. -[triton-dejavu] First execution including JIT compilation took 0.298583984375s. -[triton-dejavu] First execution including JIT compilation took 0.2960634231567383s. -[triton-dejavu] First execution including JIT compilation took 0.27265357971191406s. -[triton-dejavu] First execution including JIT compilation took 0.3366870880126953s. -[triton-dejavu] First execution including JIT compilation took 0.26715946197509766s. -[triton-dejavu] First execution including JIT compilation took 0.22327065467834473s. -[triton-dejavu] First execution including JIT compilation took 0.35770249366760254s. -[triton-dejavu] First execution including JIT compilation took 0.28089475631713867s. -[triton-dejavu] First execution including JIT compilation took 0.24740338325500488s. -[triton-dejavu] First execution including JIT compilation took 0.39159536361694336s. -[triton-dejavu] First execution including JIT compilation took 0.30934739112854004s. -[triton-dejavu] First execution including JIT compilation took 0.31633591651916504s. -[triton-dejavu] First execution including JIT compilation took 0.47846007347106934s. -[triton-dejavu] First execution including JIT compilation took 0.31675219535827637s. -[triton-dejavu] First execution including JIT compilation took 0.2682924270629883s. -[triton-dejavu] First execution including JIT compilation took 0.4402353763580322s. -[triton-dejavu] First execution including JIT compilation took 0.32269787788391113s. -[triton-dejavu] First execution including JIT compilation took 0.2709183692932129s. -[triton-dejavu] First execution including JIT compilation took 0.4444851875305176s. -[triton-dejavu] First execution including JIT compilation took 0.3340129852294922s. -[triton-dejavu] First execution including JIT compilation took 0.2642478942871094s. 
-[triton-dejavu] First execution including JIT compilation took 0.5187058448791504s. -[triton-dejavu] First execution including JIT compilation took 0.345888614654541s. -[triton-dejavu] First execution including JIT compilation took 0.2869832515716553s. -[triton-dejavu] First execution including JIT compilation took 0.4653136730194092s. -[triton-dejavu] First execution including JIT compilation took 0.318464994430542s. -[triton-dejavu] First execution including JIT compilation took 0.2446439266204834s. -[triton-dejavu] First execution including JIT compilation took 0.48433685302734375s. -[triton-dejavu] First execution including JIT compilation took 0.33881640434265137s. -[triton-dejavu] First execution including JIT compilation took 0.2716357707977295s. -[triton-dejavu] First execution including JIT compilation took 0.5825600624084473s. -[triton-dejavu] First execution including JIT compilation took 0.3634026050567627s. -[triton-dejavu] First execution including JIT compilation took 0.2927565574645996s. -[triton-dejavu] First execution including JIT compilation took 0.6513199806213379s. -[triton-dejavu] First execution including JIT compilation took 0.39306163787841797s. -[triton-dejavu] First execution including JIT compilation took 0.3288865089416504s. -[triton-dejavu] First execution including JIT compilation took 0.6803631782531738s. -[triton-dejavu] First execution including JIT compilation took 0.4358654022216797s. -[triton-dejavu] First execution including JIT compilation took 0.3263130187988281s. -[triton-dejavu] First execution including JIT compilation took 0.7428200244903564s. -[triton-dejavu] First execution including JIT compilation took 0.4704313278198242s. -[triton-dejavu] First execution including JIT compilation took 0.3472471237182617s. -[triton-dejavu] First execution including JIT compilation took 0.8439326286315918s. -[triton-dejavu] First execution including JIT compilation took 0.5137937068939209s. -[triton-dejavu] First execution including JIT compilation took 0.37453126907348633s. -[triton-dejavu] First execution including JIT compilation took 0.8335433006286621s. -[triton-dejavu] First execution including JIT compilation took 0.49039268493652344s. -[triton-dejavu] First execution including JIT compilation took 0.33686327934265137s. -[triton-dejavu] First execution including JIT compilation took 0.8961453437805176s. -[triton-dejavu] First execution including JIT compilation took 0.4983179569244385s. -[triton-dejavu] First execution including JIT compilation took 0.35771870613098145s. -[triton-dejavu] First execution including JIT compilation took 1.4264824390411377s. -[triton-dejavu] First execution including JIT compilation took 0.590933084487915s. -[triton-dejavu] First execution including JIT compilation took 0.37283968925476074s. -[triton-dejavu] First execution including JIT compilation took 1.513688564300537s. -[triton-dejavu] First execution including JIT compilation took 0.6336290836334229s. -[triton-dejavu] First execution including JIT compilation took 0.3925764560699463s. -[triton-dejavu] First execution including JIT compilation took 1.6222319602966309s. -[triton-dejavu] First execution including JIT compilation took 0.6844735145568848s. -[triton-dejavu] First execution including JIT compilation took 0.4293532371520996s. -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.27669787406921387s. -[triton-dejavu] First execution including JIT compilation took 0.2200946807861328s. -[triton-dejavu] First execution including JIT compilation took 0.20387721061706543s. -[triton-dejavu] First execution including JIT compilation took 0.28987956047058105s. -[triton-dejavu] First execution including JIT compilation took 0.2356255054473877s. -[triton-dejavu] First execution including JIT compilation took 0.27132534980773926s. 
-[triton-dejavu] First execution including JIT compilation took 0.32961010932922363s. -[triton-dejavu] First execution including JIT compilation took 0.30097293853759766s. -[triton-dejavu] First execution including JIT compilation took 0.19495487213134766s. -[triton-dejavu] First execution including JIT compilation took 0.3256947994232178s. -[triton-dejavu] First execution including JIT compilation took 0.2637317180633545s. -[triton-dejavu] First execution including JIT compilation took 0.20687651634216309s. -[triton-dejavu] First execution including JIT compilation took 0.31104588508605957s. -[triton-dejavu] First execution including JIT compilation took 0.23851871490478516s. -[triton-dejavu] First execution including JIT compilation took 0.21181392669677734s. -[triton-dejavu] First execution including JIT compilation took 0.31918883323669434s. -[triton-dejavu] First execution including JIT compilation took 0.26523566246032715s. -[triton-dejavu] First execution including JIT compilation took 0.24065852165222168s. -[triton-dejavu] First execution including JIT compilation took 0.4031362533569336s. -[triton-dejavu] First execution including JIT compilation took 0.32692384719848633s. -[triton-dejavu] First execution including JIT compilation took 0.30884742736816406s. -[triton-dejavu] First execution including JIT compilation took 0.36347198486328125s. -[triton-dejavu] First execution including JIT compilation took 0.275341272354126s. -[triton-dejavu] First execution including JIT compilation took 0.22562313079833984s. -[triton-dejavu] First execution including JIT compilation took 0.3837006092071533s. -[triton-dejavu] First execution including JIT compilation took 0.28661417961120605s. -[triton-dejavu] First execution including JIT compilation took 0.2673346996307373s. -[triton-dejavu] First execution including JIT compilation took 0.42246246337890625s. -[triton-dejavu] First execution including JIT compilation took 0.3161001205444336s. -[triton-dejavu] First execution including JIT compilation took 0.25901246070861816s. -[triton-dejavu] First execution including JIT compilation took 0.4973328113555908s. -[triton-dejavu] First execution including JIT compilation took 0.33356618881225586s. -[triton-dejavu] First execution including JIT compilation took 0.27872180938720703s. -[triton-dejavu] First execution including JIT compilation took 0.46326756477355957s. -[triton-dejavu] First execution including JIT compilation took 0.35817837715148926s. -[triton-dejavu] First execution including JIT compilation took 0.2817208766937256s. -[triton-dejavu] First execution including JIT compilation took 0.49773097038269043s. -[triton-dejavu] First execution including JIT compilation took 0.3602900505065918s. -[triton-dejavu] First execution including JIT compilation took 0.3025212287902832s. -[triton-dejavu] First execution including JIT compilation took 0.5235435962677002s. -[triton-dejavu] First execution including JIT compilation took 0.3942751884460449s. -[triton-dejavu] First execution including JIT compilation took 0.3084683418273926s. -[triton-dejavu] First execution including JIT compilation took 0.4975898265838623s. -[triton-dejavu] First execution including JIT compilation took 0.3797109127044678s. -[triton-dejavu] First execution including JIT compilation took 0.30298733711242676s. -[triton-dejavu] First execution including JIT compilation took 0.5086245536804199s. -[triton-dejavu] First execution including JIT compilation took 0.3442721366882324s. 
-[triton-dejavu] First execution including JIT compilation took 0.2747983932495117s. -[triton-dejavu] First execution including JIT compilation took 0.5613915920257568s. -[triton-dejavu] First execution including JIT compilation took 0.44350624084472656s. -[triton-dejavu] First execution including JIT compilation took 0.28230857849121094s. -[triton-dejavu] First execution including JIT compilation took 0.6058351993560791s. -[triton-dejavu] First execution including JIT compilation took 0.38971829414367676s. -[triton-dejavu] First execution including JIT compilation took 0.3060598373413086s. -[triton-dejavu] First execution including JIT compilation took 0.6243553161621094s. -[triton-dejavu] First execution including JIT compilation took 0.4053328037261963s. -[triton-dejavu] First execution including JIT compilation took 0.3107168674468994s. -[triton-dejavu] First execution including JIT compilation took 0.6879723072052002s. -[triton-dejavu] First execution including JIT compilation took 0.4307887554168701s. -[triton-dejavu] First execution including JIT compilation took 0.3229548931121826s. -[triton-dejavu] First execution including JIT compilation took 0.7918972969055176s. -[triton-dejavu] First execution including JIT compilation took 0.48616766929626465s. -[triton-dejavu] First execution including JIT compilation took 0.36690473556518555s. -[triton-dejavu] First execution including JIT compilation took 0.8196022510528564s. -[triton-dejavu] First execution including JIT compilation took 0.4654698371887207s. -[triton-dejavu] First execution including JIT compilation took 0.3202695846557617s. -[triton-dejavu] First execution including JIT compilation took 0.7949032783508301s. -[triton-dejavu] First execution including JIT compilation took 0.4816138744354248s. -[triton-dejavu] First execution including JIT compilation took 0.3490898609161377s. -[triton-dejavu] First execution including JIT compilation took 1.313990831375122s. -[triton-dejavu] First execution including JIT compilation took 0.6957395076751709s. -[triton-dejavu] First execution including JIT compilation took 0.36670374870300293s. -[triton-dejavu] First execution including JIT compilation took 1.390000581741333s. -[triton-dejavu] First execution including JIT compilation took 0.5718057155609131s. -[triton-dejavu] First execution including JIT compilation took 0.4198739528656006s. -[triton-dejavu] First execution including JIT compilation took 1.4662723541259766s. -[triton-dejavu] First execution including JIT compilation took 0.6175658702850342s. -[triton-dejavu] First execution including JIT compilation took 0.398082971572876s. -[triton-dejavu] First execution including JIT compilation took 1.5167500972747803s. -[triton-dejavu] First execution including JIT compilation took 0.6479494571685791s. -[triton-dejavu] First execution including JIT compilation took 0.41569066047668457s. -bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.4222400188446045s. -[triton-dejavu] First execution including JIT compilation took 0.5973012447357178s. -[triton-dejavu] First execution including JIT compilation took 0.3583037853240967s. -[triton-dejavu] First execution including JIT compilation took 1.356217861175537s. -[triton-dejavu] First execution including JIT compilation took 0.6360659599304199s. -[triton-dejavu] First execution including JIT compilation took 0.4930713176727295s. -[triton-dejavu] First execution including JIT compilation took 5.760070323944092s. -[triton-dejavu] First execution including JIT compilation took 1.333890438079834s. -[triton-dejavu] First execution including JIT compilation took 0.5843362808227539s. -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.4048187732696533s. -[triton-dejavu] First execution including JIT compilation took 0.2757749557495117s. -[triton-dejavu] First execution including JIT compilation took 0.22593021392822266s. -[triton-dejavu] First execution including JIT compilation took 0.3719668388366699s. -[triton-dejavu] First execution including JIT compilation took 0.2907881736755371s. -[triton-dejavu] First execution including JIT compilation took 0.22260475158691406s. 
-[triton-dejavu] First execution including JIT compilation took 0.3940160274505615s. -[triton-dejavu] First execution including JIT compilation took 0.29627037048339844s. -[triton-dejavu] First execution including JIT compilation took 0.23922204971313477s. -[triton-dejavu] First execution including JIT compilation took 0.4309115409851074s. -[triton-dejavu] First execution including JIT compilation took 0.3773322105407715s. -[triton-dejavu] First execution including JIT compilation took 0.31682252883911133s. -[triton-dejavu] First execution including JIT compilation took 0.4817657470703125s. -[triton-dejavu] First execution including JIT compilation took 0.30999183654785156s. -[triton-dejavu] First execution including JIT compilation took 0.2649409770965576s. -[triton-dejavu] First execution including JIT compilation took 0.4654359817504883s. -[triton-dejavu] First execution including JIT compilation took 0.3404858112335205s. -[triton-dejavu] First execution including JIT compilation took 0.2549777030944824s. -[triton-dejavu] First execution including JIT compilation took 0.5529801845550537s. -[triton-dejavu] First execution including JIT compilation took 0.39357733726501465s. -[triton-dejavu] First execution including JIT compilation took 0.2779700756072998s. -[triton-dejavu] First execution including JIT compilation took 0.4679603576660156s. -[triton-dejavu] First execution including JIT compilation took 0.3258848190307617s. -[triton-dejavu] First execution including JIT compilation took 0.22054314613342285s. -[triton-dejavu] First execution including JIT compilation took 0.5082552433013916s. -[triton-dejavu] First execution including JIT compilation took 0.33693814277648926s. -[triton-dejavu] First execution including JIT compilation took 0.2745835781097412s. -[triton-dejavu] First execution including JIT compilation took 0.5847163200378418s. -[triton-dejavu] First execution including JIT compilation took 0.33575940132141113s. -[triton-dejavu] First execution including JIT compilation took 0.3060939311981201s. -[triton-dejavu] First execution including JIT compilation took 0.5239126682281494s. -[triton-dejavu] First execution including JIT compilation took 0.35081052780151367s. -[triton-dejavu] First execution including JIT compilation took 0.2809262275695801s. -[triton-dejavu] First execution including JIT compilation took 0.5589377880096436s. -[triton-dejavu] First execution including JIT compilation took 0.36190342903137207s. -[triton-dejavu] First execution including JIT compilation took 0.27885007858276367s. -[triton-dejavu] First execution including JIT compilation took 0.6101348400115967s. -[triton-dejavu] First execution including JIT compilation took 0.4172549247741699s. -[triton-dejavu] First execution including JIT compilation took 0.3286736011505127s. -[triton-dejavu] First execution including JIT compilation took 0.6457531452178955s. -[triton-dejavu] First execution including JIT compilation took 0.39678049087524414s. -[triton-dejavu] First execution including JIT compilation took 0.30982041358947754s. -[triton-dejavu] First execution including JIT compilation took 0.6744742393493652s. -[triton-dejavu] First execution including JIT compilation took 0.42897796630859375s. -[triton-dejavu] First execution including JIT compilation took 0.26523256301879883s. -[triton-dejavu] First execution including JIT compilation took 0.7656145095825195s. -[triton-dejavu] First execution including JIT compilation took 0.40720272064208984s. 
-[triton-dejavu] First execution including JIT compilation took 0.28449296951293945s. -[triton-dejavu] First execution including JIT compilation took 1.1293773651123047s. -[triton-dejavu] First execution including JIT compilation took 0.5252115726470947s. -[triton-dejavu] First execution including JIT compilation took 0.3610687255859375s. -[triton-dejavu] First execution including JIT compilation took 1.4119246006011963s. -[triton-dejavu] First execution including JIT compilation took 0.6041393280029297s. -[triton-dejavu] First execution including JIT compilation took 0.3884885311126709s. -[triton-dejavu] First execution including JIT compilation took 1.526637315750122s. -[triton-dejavu] First execution including JIT compilation took 0.6266424655914307s. -[triton-dejavu] First execution including JIT compilation took 0.40192389488220215s. -[triton-dejavu] First execution including JIT compilation took 1.530601978302002s. -[triton-dejavu] First execution including JIT compilation took 0.6515090465545654s. -[triton-dejavu] First execution including JIT compilation took 0.42690610885620117s. -bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.7079706192016602s. -[triton-dejavu] First execution including JIT compilation took 0.7440791130065918s. -[triton-dejavu] First execution including JIT compilation took 0.4444162845611572s. -[triton-dejavu] First execution including JIT compilation took 1.7328886985778809s. -[triton-dejavu] First execution including JIT compilation took 0.7971758842468262s. -[triton-dejavu] First execution including JIT compilation took 0.47760677337646484s. 
-[triton-dejavu] First execution including JIT compilation took 5.828885316848755s. -[triton-dejavu] First execution including JIT compilation took 1.288949966430664s. -[triton-dejavu] First execution including JIT compilation took 0.5151238441467285s. -[triton-dejavu] First execution including JIT compilation took 5.60798192024231s. -[triton-dejavu] First execution including JIT compilation took 1.3259506225585938s. -[triton-dejavu] First execution including JIT compilation took 0.5839335918426514s. -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-bench_cudagraph failed with out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-[triton-dejavu] First execution including JIT compilation took 3.490701913833618s.
-[triton-dejavu] First execution including JIT compilation took 1.3819916248321533s.
-[triton-dejavu] First execution including JIT compilation took 0.681626558303833s.
-[triton-dejavu] First execution including JIT compilation took 4.5987389087677s.
-[triton-dejavu] First execution including JIT compilation took 1.3767080307006836s.
-[triton-dejavu] First execution including JIT compilation took 0.6134452819824219s.
-bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 737280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1032192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-[triton-dejavu] First execution including JIT compilation took 0.31005024909973145s.
-[triton-dejavu] First execution including JIT compilation took 0.2568016052246094s.
-[triton-dejavu] First execution including JIT compilation took 0.21983957290649414s.
-[triton-dejavu] First execution including JIT compilation took 0.3342258930206299s.
-[triton-dejavu] First execution including JIT compilation took 0.2613508701324463s.
-[triton-dejavu] First execution including JIT compilation took 0.23277544975280762s.
-[triton-dejavu] First execution including JIT compilation took 0.34593868255615234s. -[triton-dejavu] First execution including JIT compilation took 0.27214527130126953s. -[triton-dejavu] First execution including JIT compilation took 0.24357295036315918s. -[triton-dejavu] First execution including JIT compilation took 0.3831825256347656s. -[triton-dejavu] First execution including JIT compilation took 0.2801399230957031s. -[triton-dejavu] First execution including JIT compilation took 0.28713178634643555s. -[triton-dejavu] First execution including JIT compilation took 0.3746922016143799s. -[triton-dejavu] First execution including JIT compilation took 0.29146361351013184s. -[triton-dejavu] First execution including JIT compilation took 0.25294995307922363s. -[triton-dejavu] First execution including JIT compilation took 0.3896350860595703s. -[triton-dejavu] First execution including JIT compilation took 0.3028104305267334s. -[triton-dejavu] First execution including JIT compilation took 0.2598695755004883s. -[triton-dejavu] First execution including JIT compilation took 0.4107673168182373s. -[triton-dejavu] First execution including JIT compilation took 0.3029160499572754s. -[triton-dejavu] First execution including JIT compilation took 0.27234864234924316s. -[triton-dejavu] First execution including JIT compilation took 0.3524813652038574s. -[triton-dejavu] First execution including JIT compilation took 0.2637143135070801s. -[triton-dejavu] First execution including JIT compilation took 0.21795105934143066s. -[triton-dejavu] First execution including JIT compilation took 0.36962461471557617s. -[triton-dejavu] First execution including JIT compilation took 0.2753579616546631s. -[triton-dejavu] First execution including JIT compilation took 0.24502253532409668s. -[triton-dejavu] First execution including JIT compilation took 0.38353514671325684s. -[triton-dejavu] First execution including JIT compilation took 0.25853633880615234s. -[triton-dejavu] First execution including JIT compilation took 0.23975038528442383s. -[triton-dejavu] First execution including JIT compilation took 0.0030221939086914062s. -[triton-dejavu] First execution including JIT compilation took 0.29683613777160645s. -[triton-dejavu] First execution including JIT compilation took 0.2580904960632324s. -[triton-dejavu] First execution including JIT compilation took 0.4290771484375s. -[triton-dejavu] First execution including JIT compilation took 0.3167991638183594s. -[triton-dejavu] First execution including JIT compilation took 0.2567250728607178s. -[triton-dejavu] First execution including JIT compilation took 0.44550418853759766s. -[triton-dejavu] First execution including JIT compilation took 0.3198390007019043s. -[triton-dejavu] First execution including JIT compilation took 0.268108606338501s. -[triton-dejavu] First execution including JIT compilation took 0.4916553497314453s. -[triton-dejavu] First execution including JIT compilation took 0.3439137935638428s. -[triton-dejavu] First execution including JIT compilation took 0.27727365493774414s. -[triton-dejavu] First execution including JIT compilation took 0.460857629776001s. -[triton-dejavu] First execution including JIT compilation took 0.30243563652038574s. -[triton-dejavu] First execution including JIT compilation took 0.24333858489990234s. -[triton-dejavu] First execution including JIT compilation took 0.46892428398132324s. -[triton-dejavu] First execution including JIT compilation took 0.3167304992675781s. 
-[triton-dejavu] First execution including JIT compilation took 0.2599649429321289s. -[triton-dejavu] First execution including JIT compilation took 0.5126926898956299s. -[triton-dejavu] First execution including JIT compilation took 0.32805609703063965s. -[triton-dejavu] First execution including JIT compilation took 0.26161670684814453s. -[triton-dejavu] First execution including JIT compilation took 0.5467493534088135s. -[triton-dejavu] First execution including JIT compilation took 0.3979170322418213s. -[triton-dejavu] First execution including JIT compilation took 0.27261829376220703s. -[triton-dejavu] First execution including JIT compilation took 0.56540846824646s. -[triton-dejavu] First execution including JIT compilation took 0.35355091094970703s. -[triton-dejavu] First execution including JIT compilation took 0.276700496673584s. -[triton-dejavu] First execution including JIT compilation took 0.5869178771972656s. -[triton-dejavu] First execution including JIT compilation took 0.3624422550201416s. -[triton-dejavu] First execution including JIT compilation took 0.35153841972351074s. -[triton-dejavu] First execution including JIT compilation took 0.6571488380432129s. -[triton-dejavu] First execution including JIT compilation took 0.3958284854888916s. -[triton-dejavu] First execution including JIT compilation took 0.30527758598327637s. -[triton-dejavu] First execution including JIT compilation took 0.6626615524291992s. -[triton-dejavu] First execution including JIT compilation took 0.3544487953186035s. -[triton-dejavu] First execution including JIT compilation took 0.2698044776916504s. -[triton-dejavu] First execution including JIT compilation took 0.6961638927459717s. -[triton-dejavu] First execution including JIT compilation took 0.38259434700012207s. -[triton-dejavu] First execution including JIT compilation took 0.283905029296875s. -[triton-dejavu] First execution including JIT compilation took 0.845867395401001s. -[triton-dejavu] First execution including JIT compilation took 0.4127688407897949s. -[triton-dejavu] First execution including JIT compilation took 0.3159315586090088s. -[triton-dejavu] First execution including JIT compilation took 0.9087560176849365s. -[triton-dejavu] First execution including JIT compilation took 0.4513425827026367s. -[triton-dejavu] First execution including JIT compilation took 0.3294107913970947s. -[triton-dejavu] First execution including JIT compilation took 0.9571695327758789s. -[triton-dejavu] First execution including JIT compilation took 0.4684031009674072s. -[triton-dejavu] First execution including JIT compilation took 0.32914233207702637s. -[triton-dejavu] First execution including JIT compilation took 1.0359725952148438s. -[triton-dejavu] First execution including JIT compilation took 0.48003387451171875s. -[triton-dejavu] First execution including JIT compilation took 0.34207773208618164s. -[triton-dejavu] First execution including JIT compilation took 1.1305363178253174s. -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.2873709201812744s. -[triton-dejavu] First execution including JIT compilation took 0.534212589263916s. -[triton-dejavu] First execution including JIT compilation took 0.34093737602233887s. -[triton-dejavu] First execution including JIT compilation took 1.2213225364685059s. -[triton-dejavu] First execution including JIT compilation took 0.5500822067260742s. -[triton-dejavu] First execution including JIT compilation took 0.3482015132904053s. 
-[triton-dejavu] First execution including JIT compilation took 2.321138620376587s. -[triton-dejavu] First execution including JIT compilation took 0.5398764610290527s. -[triton-dejavu] First execution including JIT compilation took 0.3589463233947754s. -[triton-dejavu] First execution including JIT compilation took 2.2305822372436523s. -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 458752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 458752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.4210233688354492s. -[triton-dejavu] First execution including JIT compilation took 0.2671318054199219s. -[triton-dejavu] First execution including JIT compilation took 0.20480823516845703s. -[triton-dejavu] First execution including JIT compilation took 0.36168575286865234s. -[triton-dejavu] First execution including JIT compilation took 0.2831258773803711s. -[triton-dejavu] First execution including JIT compilation took 0.22981572151184082s. -[triton-dejavu] First execution including JIT compilation took 0.3903160095214844s. -[triton-dejavu] First execution including JIT compilation took 0.2804446220397949s. -[triton-dejavu] First execution including JIT compilation took 0.22222447395324707s. -[triton-dejavu] First execution including JIT compilation took 0.3762855529785156s. -[triton-dejavu] First execution including JIT compilation took 0.2824244499206543s. -[triton-dejavu] First execution including JIT compilation took 0.22802186012268066s. -[triton-dejavu] First execution including JIT compilation took 0.4042317867279053s. -[triton-dejavu] First execution including JIT compilation took 0.2850303649902344s. -[triton-dejavu] First execution including JIT compilation took 0.22367358207702637s. -[triton-dejavu] First execution including JIT compilation took 0.45253777503967285s. -[triton-dejavu] First execution including JIT compilation took 0.3078906536102295s. -[triton-dejavu] First execution including JIT compilation took 0.23833608627319336s. -[triton-dejavu] First execution including JIT compilation took 0.47820162773132324s. -[triton-dejavu] First execution including JIT compilation took 0.332599401473999s. -[triton-dejavu] First execution including JIT compilation took 0.256058931350708s. -[triton-dejavu] First execution including JIT compilation took 0.4336233139038086s. -[triton-dejavu] First execution including JIT compilation took 0.2906990051269531s. -[triton-dejavu] First execution including JIT compilation took 0.22593021392822266s. -[triton-dejavu] First execution including JIT compilation took 0.43659496307373047s. 
-[triton-dejavu] First execution including JIT compilation took 0.295365571975708s. -[triton-dejavu] First execution including JIT compilation took 0.3340928554534912s. -[triton-dejavu] First execution including JIT compilation took 0.4568207263946533s. -[triton-dejavu] First execution including JIT compilation took 0.34474825859069824s. -[triton-dejavu] First execution including JIT compilation took 0.2425243854522705s. -[triton-dejavu] First execution including JIT compilation took 0.48821306228637695s. -[triton-dejavu] First execution including JIT compilation took 0.0030794143676757812s. -[triton-dejavu] First execution including JIT compilation took 0.666248083114624s. -[triton-dejavu] First execution including JIT compilation took 0.9645810127258301s. -[triton-dejavu] First execution including JIT compilation took 0.43552494049072266s. -[triton-dejavu] First execution including JIT compilation took 0.3096005916595459s. -[triton-dejavu] First execution including JIT compilation took 0.6565234661102295s. -[triton-dejavu] First execution including JIT compilation took 0.49860286712646484s. -[triton-dejavu] First execution including JIT compilation took 0.31050992012023926s. -[triton-dejavu] First execution including JIT compilation took 0.696462869644165s. -[triton-dejavu] First execution including JIT compilation took 0.4284684658050537s. -[triton-dejavu] First execution including JIT compilation took 0.3179745674133301s. -[triton-dejavu] First execution including JIT compilation took 0.6832358837127686s. -[triton-dejavu] First execution including JIT compilation took 0.4428989887237549s. -[triton-dejavu] First execution including JIT compilation took 0.27704954147338867s. -[triton-dejavu] First execution including JIT compilation took 0.7220911979675293s. -[triton-dejavu] First execution including JIT compilation took 0.4185624122619629s. -[triton-dejavu] First execution including JIT compilation took 0.2997853755950928s. -[triton-dejavu] First execution including JIT compilation took 0.769294023513794s. -[triton-dejavu] First execution including JIT compilation took 0.44492197036743164s. -[triton-dejavu] First execution including JIT compilation took 0.3919029235839844s. -[triton-dejavu] First execution including JIT compilation took 0.8174152374267578s. -[triton-dejavu] First execution including JIT compilation took 0.4800558090209961s. -[triton-dejavu] First execution including JIT compilation took 0.3278632164001465s. -[triton-dejavu] First execution including JIT compilation took 0.8820762634277344s. -[triton-dejavu] First execution including JIT compilation took 0.4979724884033203s. -[triton-dejavu] First execution including JIT compilation took 0.3491017818450928s. -[triton-dejavu] First execution including JIT compilation took 0.9607341289520264s. -[triton-dejavu] First execution including JIT compilation took 0.5307338237762451s. -[triton-dejavu] First execution including JIT compilation took 0.3707716464996338s. -[triton-dejavu] First execution including JIT compilation took 1.0402915477752686s. -[triton-dejavu] First execution including JIT compilation took 0.5747923851013184s. -[triton-dejavu] First execution including JIT compilation took 0.3863534927368164s. -[triton-dejavu] First execution including JIT compilation took 1.1301944255828857s. -[triton-dejavu] First execution including JIT compilation took 0.5652072429656982s. -[triton-dejavu] First execution including JIT compilation took 0.34700870513916016s. 
-[triton-dejavu] First execution including JIT compilation took 1.0828943252563477s. -[triton-dejavu] First execution including JIT compilation took 0.5150623321533203s. -[triton-dejavu] First execution including JIT compilation took 0.2979111671447754s. -[triton-dejavu] First execution including JIT compilation took 1.529569149017334s. -[triton-dejavu] First execution including JIT compilation took 0.6551094055175781s. -[triton-dejavu] First execution including JIT compilation took 0.38155317306518555s. -[triton-dejavu] First execution including JIT compilation took 1.766725778579712s. -[triton-dejavu] First execution including JIT compilation took 0.6843061447143555s. -[triton-dejavu] First execution including JIT compilation took 0.4045257568359375s. -[triton-dejavu] First execution including JIT compilation took 1.8947250843048096s. -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 2.4253039360046387s. -[triton-dejavu] First execution including JIT compilation took 1.1816015243530273s. -[triton-dejavu] First execution including JIT compilation took 0.4922316074371338s. -[triton-dejavu] First execution including JIT compilation took 2.298893451690674s. -[triton-dejavu] First execution including JIT compilation took 1.2072784900665283s. -[triton-dejavu] First execution including JIT compilation took 0.5224888324737549s. -[triton-dejavu] First execution including JIT compilation took 6.951720952987671s. -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. 
Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 688128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 688128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.6793637275695801s. -[triton-dejavu] First execution including JIT compilation took 0.6418313980102539s. -[triton-dejavu] First execution including JIT compilation took 0.32692575454711914s. -[triton-dejavu] First execution including JIT compilation took 0.8132402896881104s. -[triton-dejavu] First execution including JIT compilation took 0.5734570026397705s. -[triton-dejavu] First execution including JIT compilation took 0.35916733741760254s. -[triton-dejavu] First execution including JIT compilation took 0.8288097381591797s. -[triton-dejavu] First execution including JIT compilation took 0.5815913677215576s. -[triton-dejavu] First execution including JIT compilation took 0.39780449867248535s. -[triton-dejavu] First execution including JIT compilation took 0.8627204895019531s. -[triton-dejavu] First execution including JIT compilation took 0.5819535255432129s. -[triton-dejavu] First execution including JIT compilation took 0.3681964874267578s. -[triton-dejavu] First execution including JIT compilation took 0.8420388698577881s. -[triton-dejavu] First execution including JIT compilation took 0.5943279266357422s. -[triton-dejavu] First execution including JIT compilation took 0.36092662811279297s. -[triton-dejavu] First execution including JIT compilation took 0.8624413013458252s. -[triton-dejavu] First execution including JIT compilation took 0.5882468223571777s. -[triton-dejavu] First execution including JIT compilation took 0.3868570327758789s. -[triton-dejavu] First execution including JIT compilation took 0.9039130210876465s. -[triton-dejavu] First execution including JIT compilation took 0.6410880088806152s. -[triton-dejavu] First execution including JIT compilation took 0.3988831043243408s. -[triton-dejavu] First execution including JIT compilation took 0.9115607738494873s. -[triton-dejavu] First execution including JIT compilation took 0.5902762413024902s. -[triton-dejavu] First execution including JIT compilation took 0.34618401527404785s. -[triton-dejavu] First execution including JIT compilation took 0.9532392024993896s. 
-[triton-dejavu] First execution including JIT compilation took 0.635444164276123s. -[triton-dejavu] First execution including JIT compilation took 0.3781321048736572s. -[triton-dejavu] First execution including JIT compilation took 1.0092928409576416s. -[triton-dejavu] First execution including JIT compilation took 0.6709246635437012s. -[triton-dejavu] First execution including JIT compilation took 0.38914012908935547s. -[triton-dejavu] First execution including JIT compilation took 1.0928781032562256s. -[triton-dejavu] First execution including JIT compilation took 0.003269672393798828s. -[triton-dejavu] First execution including JIT compilation took 0.3971376419067383s. -[triton-dejavu] First execution including JIT compilation took 1.0150482654571533s. -[triton-dejavu] First execution including JIT compilation took 0.6963634490966797s. -[triton-dejavu] First execution including JIT compilation took 0.40409111976623535s. -[triton-dejavu] First execution including JIT compilation took 1.1118721961975098s. -[triton-dejavu] First execution including JIT compilation took 0.6946852207183838s. -[triton-dejavu] First execution including JIT compilation took 0.4175405502319336s. -[triton-dejavu] First execution including JIT compilation took 1.177678108215332s. -[triton-dejavu] First execution including JIT compilation took 0.7395210266113281s. -[triton-dejavu] First execution including JIT compilation took 0.4114506244659424s. -[triton-dejavu] First execution including JIT compilation took 1.3139593601226807s. -[triton-dejavu] First execution including JIT compilation took 0.7956829071044922s. -[triton-dejavu] First execution including JIT compilation took 0.39879727363586426s. -[triton-dejavu] First execution including JIT compilation took 1.328862190246582s. -[triton-dejavu] First execution including JIT compilation took 0.6896190643310547s. -[triton-dejavu] First execution including JIT compilation took 0.33518552780151367s. -[triton-dejavu] First execution including JIT compilation took 1.5963466167449951s. -[triton-dejavu] First execution including JIT compilation took 0.7468023300170898s. -[triton-dejavu] First execution including JIT compilation took 0.4028303623199463s. -[triton-dejavu] First execution including JIT compilation took 1.7192442417144775s. -[triton-dejavu] First execution including JIT compilation took 0.7985246181488037s. -[triton-dejavu] First execution including JIT compilation took 0.37429141998291016s. -[triton-dejavu] First execution including JIT compilation took 1.7711453437805176s. -[triton-dejavu] First execution including JIT compilation took 0.9907217025756836s. -[triton-dejavu] First execution including JIT compilation took 0.49533724784851074s. -[triton-dejavu] First execution including JIT compilation took 2.3483541011810303s. -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 2.211524486541748s. -[triton-dejavu] First execution including JIT compilation took 1.2513744831085205s. -[triton-dejavu] First execution including JIT compilation took 0.4271657466888428s. -[triton-dejavu] First execution including JIT compilation took 2.204448938369751s. -[triton-dejavu] First execution including JIT compilation took 1.185486078262329s. -[triton-dejavu] First execution including JIT compilation took 0.5369844436645508s. -[triton-dejavu] First execution including JIT compilation took 6.1959922313690186s. -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. 
Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 5.166867017745972s. -[triton-dejavu] First execution including JIT compilation took 3.8762850761413574s. -[triton-dejavu] First execution including JIT compilation took 0.7830004692077637s. -[triton-dejavu] First execution including JIT compilation took 6.120173931121826s. -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1146880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-[triton-dejavu] First execution including JIT compilation took 0.33310794830322266s.
-[triton-dejavu] First execution including JIT compilation took 0.23391294479370117s.
-[triton-dejavu] First execution including JIT compilation took 0.2084214687347412s.
-[triton-dejavu] First execution including JIT compilation took 0.3005564212799072s.
-[triton-dejavu] First execution including JIT compilation took 0.2554941177368164s.
-[triton-dejavu] First execution including JIT compilation took 0.21907782554626465s.
-[triton-dejavu] First execution including JIT compilation took 0.31569480895996094s.
-[triton-dejavu] First execution including JIT compilation took 0.2694690227508545s.
-[triton-dejavu] First execution including JIT compilation took 0.22438645362854004s.
-[triton-dejavu] First execution including JIT compilation took 0.3229238986968994s.
-[triton-dejavu] First execution including JIT compilation took 0.2797393798828125s.
-[triton-dejavu] First execution including JIT compilation took 0.23162508010864258s.
-[triton-dejavu] First execution including JIT compilation took 0.3365445137023926s.
-[triton-dejavu] First execution including JIT compilation took 0.2754044532775879s.
-[triton-dejavu] First execution including JIT compilation took 0.22548437118530273s.
-[triton-dejavu] First execution including JIT compilation took 0.3373396396636963s.
-[triton-dejavu] First execution including JIT compilation took 0.2857697010040283s.
-[triton-dejavu] First execution including JIT compilation took 0.2294597625732422s.
-[triton-dejavu] First execution including JIT compilation took 0.3634176254272461s.
-[triton-dejavu] First execution including JIT compilation took 0.294708251953125s.
-[triton-dejavu] First execution including JIT compilation took 0.24236321449279785s.
-[triton-dejavu] First execution including JIT compilation took 0.3028702735900879s.
-[triton-dejavu] First execution including JIT compilation took 0.2470991611480713s.
-[triton-dejavu] First execution including JIT compilation took 0.21360516548156738s.
-[triton-dejavu] First execution including JIT compilation took 0.3189256191253662s.
-[triton-dejavu] First execution including JIT compilation took 0.25740885734558105s.
-[triton-dejavu] First execution including JIT compilation took 0.23542547225952148s.
-[triton-dejavu] First execution including JIT compilation took 0.34380078315734863s.
-[triton-dejavu] First execution including JIT compilation took 0.2774670124053955s.
-[triton-dejavu] First execution including JIT compilation took 0.24950265884399414s.
-[triton-dejavu] First execution including JIT compilation took 0.4161198139190674s.
-[triton-dejavu] First execution including JIT compilation took 0.28986072540283203s.
-[triton-dejavu] First execution including JIT compilation took 0.2589759826660156s.
-[triton-dejavu] First execution including JIT compilation took 0.41210365295410156s.
-[triton-dejavu] First execution including JIT compilation took 0.32729268074035645s.
-[triton-dejavu] First execution including JIT compilation took 0.25850629806518555s.
-[triton-dejavu] First execution including JIT compilation took 0.4299044609069824s.
-[triton-dejavu] First execution including JIT compilation took 0.3116121292114258s.
-[triton-dejavu] First execution including JIT compilation took 0.27123379707336426s.
-[triton-dejavu] First execution including JIT compilation took 0.45281362533569336s.
-[triton-dejavu] First execution including JIT compilation took 0.3351759910583496s.
-[triton-dejavu] First execution including JIT compilation took 0.2787160873413086s.
-[triton-dejavu] First execution including JIT compilation took 0.41561436653137207s.
-[triton-dejavu] First execution including JIT compilation took 0.27190589904785156s.
-[triton-dejavu] First execution including JIT compilation took 0.2324838638305664s.
-[triton-dejavu] First execution including JIT compilation took 0.4087650775909424s.
-[triton-dejavu] First execution including JIT compilation took 0.28690099716186523s.
-[triton-dejavu] First execution including JIT compilation took 0.24116730690002441s.
-[triton-dejavu] First execution including JIT compilation took 0.5066123008728027s.
-[triton-dejavu] First execution including JIT compilation took 0.3034372329711914s.
-[triton-dejavu] First execution including JIT compilation took 0.25580596923828125s.
-[triton-dejavu] First execution including JIT compilation took 0.525223970413208s.
-[triton-dejavu] First execution including JIT compilation took 0.33296680450439453s.
-[triton-dejavu] First execution including JIT compilation took 0.27128124237060547s.
-[triton-dejavu] First execution including JIT compilation took 0.5657172203063965s.
-[triton-dejavu] First execution including JIT compilation took 0.3399391174316406s.
-[triton-dejavu] First execution including JIT compilation took 0.28380680084228516s.
-[triton-dejavu] First execution including JIT compilation took 0.6111602783203125s.
-[triton-dejavu] First execution including JIT compilation took 0.36371636390686035s.
-[triton-dejavu] First execution including JIT compilation took 0.3011593818664551s.
-[triton-dejavu] First execution including JIT compilation took 0.7230055332183838s.
-[triton-dejavu] First execution including JIT compilation took 0.4232914447784424s.
-[triton-dejavu] First execution including JIT compilation took 0.31528306007385254s.
-[triton-dejavu] First execution including JIT compilation took 0.6461219787597656s.
-[triton-dejavu] First execution including JIT compilation took 0.36070823669433594s.
-[triton-dejavu] First execution including JIT compilation took 0.2686340808868408s.
-[triton-dejavu] First execution including JIT compilation took 0.6663899421691895s.
-[triton-dejavu] First execution including JIT compilation took 0.3726685047149658s.
-[triton-dejavu] First execution including JIT compilation took 0.2806117534637451s.
-[triton-dejavu] First execution including JIT compilation took 1.2110939025878906s.
-[triton-dejavu] First execution including JIT compilation took 0.43669724464416504s.
-[triton-dejavu] First execution including JIT compilation took 0.29979729652404785s.
-[triton-dejavu] First execution including JIT compilation took 1.2734310626983643s.
-[triton-dejavu] First execution including JIT compilation took 0.4524815082550049s.
-[triton-dejavu] First execution including JIT compilation took 0.30893588066101074s.
-[triton-dejavu] First execution including JIT compilation took 1.3412039279937744s.
-[triton-dejavu] First execution including JIT compilation took 0.4808011054992676s.
-[triton-dejavu] First execution including JIT compilation took 0.32793712615966797s.
-[triton-dejavu] First execution including JIT compilation took 1.3745002746582031s.
-[triton-dejavu] First execution including JIT compilation took 0.4310414791107178s.
-[triton-dejavu] First execution including JIT compilation took 0.2714354991912842s.
-bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-[triton-dejavu] First execution including JIT compilation took 1.2250773906707764s.
-[triton-dejavu] First execution including JIT compilation took 0.4646158218383789s.
-[triton-dejavu] First execution including JIT compilation took 0.2895984649658203s.
-[triton-dejavu] First execution including JIT compilation took 1.310636043548584s.
-[triton-dejavu] First execution including JIT compilation took 0.5870482921600342s.
-[triton-dejavu] First execution including JIT compilation took 0.36336755752563477s.
-[triton-dejavu] First execution including JIT compilation took 5.4522223472595215s.
-[triton-dejavu] First execution including JIT compilation took 0.9788007736206055s.
-[triton-dejavu] First execution including JIT compilation took 0.3662402629852295s.
-[triton-dejavu] First execution including JIT compilation took 5.491236209869385s.
-bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-[triton-dejavu] First execution including JIT compilation took 0.3520832061767578s.
-[triton-dejavu] First execution including JIT compilation took 0.3299834728240967s.
-[triton-dejavu] First execution including JIT compilation took 0.21879220008850098s.
-[triton-dejavu] First execution including JIT compilation took 0.3383035659790039s.
-[triton-dejavu] First execution including JIT compilation took 0.2801651954650879s.
-[triton-dejavu] First execution including JIT compilation took 0.23688268661499023s.
-[triton-dejavu] First execution including JIT compilation took 0.3647580146789551s.
-[triton-dejavu] First execution including JIT compilation took 0.2947418689727783s.
-[triton-dejavu] First execution including JIT compilation took 0.27162957191467285s.
-[triton-dejavu] First execution including JIT compilation took 0.38416576385498047s.
-[triton-dejavu] First execution including JIT compilation took 0.30350494384765625s.
-[triton-dejavu] First execution including JIT compilation took 0.2574441432952881s.
-[triton-dejavu] First execution including JIT compilation took 0.4012939929962158s.
-[triton-dejavu] First execution including JIT compilation took 0.31052398681640625s.
-[triton-dejavu] First execution including JIT compilation took 0.26515769958496094s.
-[triton-dejavu] First execution including JIT compilation took 0.39237308502197266s.
-[triton-dejavu] First execution including JIT compilation took 0.3096015453338623s.
-[triton-dejavu] First execution including JIT compilation took 0.262850284576416s.
-[triton-dejavu] First execution including JIT compilation took 0.4091818332672119s.
-[triton-dejavu] First execution including JIT compilation took 0.321216344833374s.
-[triton-dejavu] First execution including JIT compilation took 0.2732977867126465s.
-[triton-dejavu] First execution including JIT compilation took 0.4301795959472656s.
-[triton-dejavu] First execution including JIT compilation took 0.2619006633758545s.
-[triton-dejavu] First execution including JIT compilation took 0.22098731994628906s.
-[triton-dejavu] First execution including JIT compilation took 0.37630200386047363s.
-[triton-dejavu] First execution including JIT compilation took 0.34072089195251465s.
-[triton-dejavu] First execution including JIT compilation took 0.23756647109985352s.
-[triton-dejavu] First execution including JIT compilation took 0.41828155517578125s.
-[triton-dejavu] First execution including JIT compilation took 0.30147528648376465s.
-[triton-dejavu] First execution including JIT compilation took 0.2543652057647705s.
-[triton-dejavu] First execution including JIT compilation took 0.43347787857055664s.
-[triton-dejavu] First execution including JIT compilation took 0.0028808116912841797s.
-[triton-dejavu] First execution including JIT compilation took 0.2641146183013916s.
-[triton-dejavu] First execution including JIT compilation took 0.530811071395874s.
-[triton-dejavu] First execution including JIT compilation took 0.3217940330505371s.
-[triton-dejavu] First execution including JIT compilation took 0.28223276138305664s.
-[triton-dejavu] First execution including JIT compilation took 0.47287917137145996s.
-[triton-dejavu] First execution including JIT compilation took 0.3476870059967041s.
-[triton-dejavu] First execution including JIT compilation took 0.28547072410583496s.
-[triton-dejavu] First execution including JIT compilation took 0.524724006652832s.
-[triton-dejavu] First execution including JIT compilation took 0.36275696754455566s.
-[triton-dejavu] First execution including JIT compilation took 0.2947351932525635s.
-[triton-dejavu] First execution including JIT compilation took 0.4834451675415039s.
-[triton-dejavu] First execution including JIT compilation took 0.31950998306274414s.
-[triton-dejavu] First execution including JIT compilation took 0.2404794692993164s.
-[triton-dejavu] First execution including JIT compilation took 0.5020296573638916s.
-[triton-dejavu] First execution including JIT compilation took 0.32535886764526367s.
-[triton-dejavu] First execution including JIT compilation took 0.2655165195465088s.
-[triton-dejavu] First execution including JIT compilation took 0.5749058723449707s.
-[triton-dejavu] First execution including JIT compilation took 0.35364580154418945s.
-[triton-dejavu] First execution including JIT compilation took 0.26879262924194336s.
-[triton-dejavu] First execution including JIT compilation took 0.5956721305847168s.
-[triton-dejavu] First execution including JIT compilation took 0.3535337448120117s.
-[triton-dejavu] First execution including JIT compilation took 0.28673887252807617s.
-[triton-dejavu] First execution including JIT compilation took 0.6367616653442383s.
-[triton-dejavu] First execution including JIT compilation took 0.3775303363800049s.
-[triton-dejavu] First execution including JIT compilation took 0.3009200096130371s.
-[triton-dejavu] First execution including JIT compilation took 0.6813859939575195s.
-[triton-dejavu] First execution including JIT compilation took 0.4030306339263916s.
-[triton-dejavu] First execution including JIT compilation took 0.29657673835754395s.
-[triton-dejavu] First execution including JIT compilation took 0.7835328578948975s.
-[triton-dejavu] First execution including JIT compilation took 0.4300117492675781s.
-[triton-dejavu] First execution including JIT compilation took 0.3349874019622803s.
-[triton-dejavu] First execution including JIT compilation took 0.7587988376617432s.
-[triton-dejavu] First execution including JIT compilation took 0.4517331123352051s.
-[triton-dejavu] First execution including JIT compilation took 0.28656530380249023s.
-[triton-dejavu] First execution including JIT compilation took 0.7893960475921631s. -[triton-dejavu] First execution including JIT compilation took 0.4339447021484375s. -[triton-dejavu] First execution including JIT compilation took 0.3180868625640869s. -[triton-dejavu] First execution including JIT compilation took 1.3489327430725098s. -[triton-dejavu] First execution including JIT compilation took 0.5434455871582031s. -[triton-dejavu] First execution including JIT compilation took 0.35109591484069824s. -[triton-dejavu] First execution including JIT compilation took 1.4147284030914307s. -[triton-dejavu] First execution including JIT compilation took 0.5231330394744873s. -[triton-dejavu] First execution including JIT compilation took 0.35427212715148926s. -[triton-dejavu] First execution including JIT compilation took 1.4746348857879639s. -[triton-dejavu] First execution including JIT compilation took 0.561915397644043s. -[triton-dejavu] First execution including JIT compilation took 0.3480250835418701s. -[triton-dejavu] First execution including JIT compilation took 1.5745019912719727s. -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.5983943939208984s. -[triton-dejavu] First execution including JIT compilation took 0.6874723434448242s. -[triton-dejavu] First execution including JIT compilation took 0.3813052177429199s. -[triton-dejavu] First execution including JIT compilation took 1.7068426609039307s. -[triton-dejavu] First execution including JIT compilation took 0.6691153049468994s. -[triton-dejavu] First execution including JIT compilation took 0.39803266525268555s. 
-[triton-dejavu] First execution including JIT compilation took 5.79765248298645s. -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 409600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 573440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.3679988384246826s. -[triton-dejavu] First execution including JIT compilation took 0.2748281955718994s. -[triton-dejavu] First execution including JIT compilation took 0.2133641242980957s. -[triton-dejavu] First execution including JIT compilation took 0.42013049125671387s. -[triton-dejavu] First execution including JIT compilation took 0.30323004722595215s. -[triton-dejavu] First execution including JIT compilation took 0.2673945426940918s. 
-[triton-dejavu] First execution including JIT compilation took 0.49060797691345215s. -[triton-dejavu] First execution including JIT compilation took 0.37160611152648926s. -[triton-dejavu] First execution including JIT compilation took 0.2765369415283203s. -[triton-dejavu] First execution including JIT compilation took 0.5039165019989014s. -[triton-dejavu] First execution including JIT compilation took 0.37934136390686035s. -[triton-dejavu] First execution including JIT compilation took 0.2831428050994873s. -[triton-dejavu] First execution including JIT compilation took 0.5394682884216309s. -[triton-dejavu] First execution including JIT compilation took 0.37555360794067383s. -[triton-dejavu] First execution including JIT compilation took 0.2974073886871338s. -[triton-dejavu] First execution including JIT compilation took 0.5276486873626709s. -[triton-dejavu] First execution including JIT compilation took 0.39134836196899414s. -[triton-dejavu] First execution including JIT compilation took 0.2950737476348877s. -[triton-dejavu] First execution including JIT compilation took 0.5684738159179688s. -[triton-dejavu] First execution including JIT compilation took 0.41124916076660156s. -[triton-dejavu] First execution including JIT compilation took 0.3004477024078369s. -[triton-dejavu] First execution including JIT compilation took 0.5164830684661865s. -[triton-dejavu] First execution including JIT compilation took 0.33581042289733887s. -[triton-dejavu] First execution including JIT compilation took 0.27167344093322754s. -[triton-dejavu] First execution including JIT compilation took 0.5106401443481445s. -[triton-dejavu] First execution including JIT compilation took 0.37090396881103516s. -[triton-dejavu] First execution including JIT compilation took 0.2658994197845459s. -[triton-dejavu] First execution including JIT compilation took 0.5844974517822266s. -[triton-dejavu] First execution including JIT compilation took 0.3731074333190918s. -[triton-dejavu] First execution including JIT compilation took 0.31909990310668945s. -[triton-dejavu] First execution including JIT compilation took 0.5862879753112793s. -[triton-dejavu] First execution including JIT compilation took 0.0029494762420654297s. -[triton-dejavu] First execution including JIT compilation took 0.290740966796875s. -[triton-dejavu] First execution including JIT compilation took 0.6013433933258057s. -[triton-dejavu] First execution including JIT compilation took 0.4201853275299072s. -[triton-dejavu] First execution including JIT compilation took 0.30014801025390625s. -[triton-dejavu] First execution including JIT compilation took 0.6341700553894043s. -[triton-dejavu] First execution including JIT compilation took 0.4125685691833496s. -[triton-dejavu] First execution including JIT compilation took 0.3149580955505371s. -[triton-dejavu] First execution including JIT compilation took 0.7038888931274414s. -[triton-dejavu] First execution including JIT compilation took 0.44381022453308105s. -[triton-dejavu] First execution including JIT compilation took 0.3345675468444824s. -[triton-dejavu] First execution including JIT compilation took 0.7102112770080566s. -[triton-dejavu] First execution including JIT compilation took 0.39132046699523926s. -[triton-dejavu] First execution including JIT compilation took 0.2966330051422119s. -[triton-dejavu] First execution including JIT compilation took 0.7261581420898438s. -[triton-dejavu] First execution including JIT compilation took 0.42345237731933594s. 
-[triton-dejavu] First execution including JIT compilation took 0.31378960609436035s. -[triton-dejavu] First execution including JIT compilation took 0.7939469814300537s. -[triton-dejavu] First execution including JIT compilation took 0.45282721519470215s. -[triton-dejavu] First execution including JIT compilation took 0.3177626132965088s. -[triton-dejavu] First execution including JIT compilation took 0.8336560726165771s. -[triton-dejavu] First execution including JIT compilation took 0.35431385040283203s. -[triton-dejavu] First execution including JIT compilation took 0.32625389099121094s. -[triton-dejavu] First execution including JIT compilation took 0.768075704574585s. -[triton-dejavu] First execution including JIT compilation took 0.3967933654785156s. -[triton-dejavu] First execution including JIT compilation took 0.27690625190734863s. -[triton-dejavu] First execution including JIT compilation took 0.9250342845916748s. -[triton-dejavu] First execution including JIT compilation took 0.49423885345458984s. -[triton-dejavu] First execution including JIT compilation took 0.34920620918273926s. -[triton-dejavu] First execution including JIT compilation took 1.0775840282440186s. -[triton-dejavu] First execution including JIT compilation took 0.5416042804718018s. -[triton-dejavu] First execution including JIT compilation took 0.38259077072143555s. -[triton-dejavu] First execution including JIT compilation took 1.1039273738861084s. -[triton-dejavu] First execution including JIT compilation took 0.526303768157959s. -[triton-dejavu] First execution including JIT compilation took 0.34534621238708496s. -[triton-dejavu] First execution including JIT compilation took 1.1143405437469482s. -[triton-dejavu] First execution including JIT compilation took 0.5508031845092773s. -[triton-dejavu] First execution including JIT compilation took 0.37677478790283203s. -[triton-dejavu] First execution including JIT compilation took 1.8315963745117188s. -[triton-dejavu] First execution including JIT compilation took 0.6505274772644043s. -[triton-dejavu] First execution including JIT compilation took 0.39488959312438965s. -[triton-dejavu] First execution including JIT compilation took 1.9625489711761475s. -[triton-dejavu] First execution including JIT compilation took 0.6776554584503174s. -[triton-dejavu] First execution including JIT compilation took 0.41101694107055664s. -[triton-dejavu] First execution including JIT compilation took 2.0118651390075684s. -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 2.172940969467163s. -[triton-dejavu] First execution including JIT compilation took 0.9175102710723877s. -[triton-dejavu] First execution including JIT compilation took 0.42366957664489746s. -[triton-dejavu] First execution including JIT compilation took 2.0173258781433105s. -[triton-dejavu] First execution including JIT compilation took 0.7885754108428955s. -[triton-dejavu] First execution including JIT compilation took 0.44706130027770996s. 
-[triton-dejavu] First execution including JIT compilation took 7.324063301086426s.
-bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[... the identical "bench_cudagraph failed" OutOfResources traceback repeats for further autotuner configurations, with Required values of 262144, 327680, 393216, 458752, 491520, 524288, 589824, 655360, 688128, 786432, 917504, and 1048576 bytes against the same 232448-byte hardware limit ...]
-[... interleaved "[triton-dejavu] First execution including JIT compilation took ...s." timing lines for the successful configurations, ranging from roughly 0.02 s to 4.2 s ...]
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.3607323169708252s. -[triton-dejavu] First execution including JIT compilation took 0.7990188598632812s. -[triton-dejavu] First execution including JIT compilation took 0.39726877212524414s. -[triton-dejavu] First execution including JIT compilation took 1.393247127532959s. -[triton-dejavu] First execution including JIT compilation took 0.9832372665405273s. -[triton-dejavu] First execution including JIT compilation took 0.4763679504394531s. 
-[triton-dejavu] First execution including JIT compilation took 1.456979513168335s. -[triton-dejavu] First execution including JIT compilation took 1.0040147304534912s. -[triton-dejavu] First execution including JIT compilation took 0.4499683380126953s. -[triton-dejavu] First execution including JIT compilation took 1.467405080795288s. -[triton-dejavu] First execution including JIT compilation took 1.0723049640655518s. -[triton-dejavu] First execution including JIT compilation took 0.49906277656555176s. -[triton-dejavu] First execution including JIT compilation took 1.524533987045288s. -[triton-dejavu] First execution including JIT compilation took 1.4248688220977783s. -[triton-dejavu] First execution including JIT compilation took 0.6042609214782715s. -[triton-dejavu] First execution including JIT compilation took 1.7416322231292725s. -[triton-dejavu] First execution including JIT compilation took 1.0214593410491943s. -[triton-dejavu] First execution including JIT compilation took 0.45897865295410156s. -[triton-dejavu] First execution including JIT compilation took 1.5276007652282715s. -[triton-dejavu] First execution including JIT compilation took 1.0185387134552002s. -[triton-dejavu] First execution including JIT compilation took 0.5293161869049072s. -[triton-dejavu] First execution including JIT compilation took 1.8517167568206787s. -[triton-dejavu] First execution including JIT compilation took 0.9630119800567627s. -[triton-dejavu] First execution including JIT compilation took 0.43575310707092285s. -[triton-dejavu] First execution including JIT compilation took 1.9177396297454834s. -[triton-dejavu] First execution including JIT compilation took 1.569082498550415s. -[triton-dejavu] First execution including JIT compilation took 0.622168779373169s. -[triton-dejavu] First execution including JIT compilation took 2.339301347732544s. -[triton-dejavu] First execution including JIT compilation took 1.5994513034820557s. -[triton-dejavu] First execution including JIT compilation took 0.6422829627990723s. -[triton-dejavu] First execution including JIT compilation took 2.1358773708343506s. -[triton-dejavu] First execution including JIT compilation took 1.1553890705108643s. -[triton-dejavu] First execution including JIT compilation took 0.5729074478149414s. -[triton-dejavu] First execution including JIT compilation took 1.8737192153930664s. -[triton-dejavu] First execution including JIT compilation took 1.6270005702972412s. -[triton-dejavu] First execution including JIT compilation took 0.5927095413208008s. -[triton-dejavu] First execution including JIT compilation took 1.9137556552886963s. -[triton-dejavu] First execution including JIT compilation took 1.6627833843231201s. -[triton-dejavu] First execution including JIT compilation took 0.6282734870910645s. -[triton-dejavu] First execution including JIT compilation took 2.6357598304748535s. -[triton-dejavu] First execution including JIT compilation took 1.3591229915618896s. -[triton-dejavu] First execution including JIT compilation took 0.6953067779541016s. -[triton-dejavu] First execution including JIT compilation took 2.43611741065979s. -[triton-dejavu] First execution including JIT compilation took 1.2323598861694336s. -[triton-dejavu] First execution including JIT compilation took 0.6111257076263428s. -[triton-dejavu] First execution including JIT compilation took 2.841799259185791s. -[triton-dejavu] First execution including JIT compilation took 1.360656976699829s. 
-[triton-dejavu] First execution including JIT compilation took 0.8137938976287842s. -[triton-dejavu] First execution including JIT compilation took 3.458110809326172s. -[triton-dejavu] First execution including JIT compilation took 1.5271718502044678s. -[triton-dejavu] First execution including JIT compilation took 0.0032939910888671875s. -[triton-dejavu] First execution including JIT compilation took 2.9182276725769043s. -[triton-dejavu] First execution including JIT compilation took 1.539180040359497s. -[triton-dejavu] First execution including JIT compilation took 0.6763615608215332s. -[triton-dejavu] First execution including JIT compilation took 3.089775562286377s. -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 4.155709505081177s. -[triton-dejavu] First execution including JIT compilation took 2.4426767826080322s. -[triton-dejavu] First execution including JIT compilation took 1.0379819869995117s. -[triton-dejavu] First execution including JIT compilation took 4.222529649734497s. -[triton-dejavu] First execution including JIT compilation took 2.4925472736358643s. -[triton-dejavu] First execution including JIT compilation took 1.073103666305542s. -[triton-dejavu] First execution including JIT compilation took 8.762295961380005s. -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. 
Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 688128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 688128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ - self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) -RuntimeError: Triton Error [CUDA]: out of memory - -[triton-dejavu] First execution including JIT compilation took 7.50976037979126s. -[triton-dejavu] First execution including JIT compilation took 3.327193260192871s. 
-bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ - self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
[... condensed: the deleted tuning log repeats the bench_cudagraph OutOfResources traceback above verbatim for shared-memory requirements between 243712 and 1572864 bytes against the 232448-byte hardware limit, interleaved with "[triton-dejavu] First execution including JIT compilation took <seconds>s." timing lines ...]
-bench_cudagraph failed with out of resource: shared memory, Required: 696320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 696320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 835584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 835584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 835584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 835584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 974848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 974848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1114112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1114112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1114112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1114112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.5393879413604736s. -[triton-dejavu] First execution including JIT compilation took 0.36963605880737305s. -[triton-dejavu] First execution including JIT compilation took 0.3970627784729004s. -[triton-dejavu] First execution including JIT compilation took 0.4841430187225342s. -[triton-dejavu] First execution including JIT compilation took 0.3199918270111084s. -[triton-dejavu] First execution including JIT compilation took 0.2800755500793457s. 
-[triton-dejavu] First execution including JIT compilation took 0.5644237995147705s. -[triton-dejavu] First execution including JIT compilation took 0.3869204521179199s. -[triton-dejavu] First execution including JIT compilation took 0.4037020206451416s. -[triton-dejavu] First execution including JIT compilation took 0.5500500202178955s. -[triton-dejavu] First execution including JIT compilation took 0.3945121765136719s. -[triton-dejavu] First execution including JIT compilation took 0.3146946430206299s. -[triton-dejavu] First execution including JIT compilation took 0.5734715461730957s. -[triton-dejavu] First execution including JIT compilation took 0.5372509956359863s. -[triton-dejavu] First execution including JIT compilation took 0.3640165328979492s. -[triton-dejavu] First execution including JIT compilation took 0.6109771728515625s. -[triton-dejavu] First execution including JIT compilation took 0.4634361267089844s. -[triton-dejavu] First execution including JIT compilation took 0.4206717014312744s. -[triton-dejavu] First execution including JIT compilation took 1.0486819744110107s. -[triton-dejavu] First execution including JIT compilation took 0.44484424591064453s. -[triton-dejavu] First execution including JIT compilation took 0.3491060733795166s. -[triton-dejavu] First execution including JIT compilation took 0.7697179317474365s. -[triton-dejavu] First execution including JIT compilation took 0.3961319923400879s. -[triton-dejavu] First execution including JIT compilation took 0.3008708953857422s. -[triton-dejavu] First execution including JIT compilation took 0.6616361141204834s. -[triton-dejavu] First execution including JIT compilation took 0.45753026008605957s. -[triton-dejavu] First execution including JIT compilation took 0.3097813129425049s. -[triton-dejavu] First execution including JIT compilation took 0.7761518955230713s. -[triton-dejavu] First execution including JIT compilation took 0.5004098415374756s. -[triton-dejavu] First execution including JIT compilation took 0.3134744167327881s. -[triton-dejavu] First execution including JIT compilation took 0.7714171409606934s. -[triton-dejavu] First execution including JIT compilation took 0.7993361949920654s. -[triton-dejavu] First execution including JIT compilation took 0.34277820587158203s. -[triton-dejavu] First execution including JIT compilation took 0.808971643447876s. -[triton-dejavu] First execution including JIT compilation took 0.4371776580810547s. -[triton-dejavu] First execution including JIT compilation took 0.31221866607666016s. -[triton-dejavu] First execution including JIT compilation took 0.6809587478637695s. -[triton-dejavu] First execution including JIT compilation took 0.40524864196777344s. -[triton-dejavu] First execution including JIT compilation took 0.49398159980773926s. -[triton-dejavu] First execution including JIT compilation took 0.7367451190948486s. -[triton-dejavu] First execution including JIT compilation took 0.7439749240875244s. -[triton-dejavu] First execution including JIT compilation took 0.3696317672729492s. -[triton-dejavu] First execution including JIT compilation took 1.1181640625s. -[triton-dejavu] First execution including JIT compilation took 0.4313173294067383s. -[triton-dejavu] First execution including JIT compilation took 0.297299861907959s. -[triton-dejavu] First execution including JIT compilation took 0.8869140148162842s. -[triton-dejavu] First execution including JIT compilation took 0.48682713508605957s. 
-[triton-dejavu] First execution including JIT compilation took 0.3501567840576172s. -[triton-dejavu] First execution including JIT compilation took 1.4581646919250488s. -[triton-dejavu] First execution including JIT compilation took 0.5649135112762451s. -[triton-dejavu] First execution including JIT compilation took 0.3721659183502197s. -[triton-dejavu] First execution including JIT compilation took 1.5119690895080566s. -[triton-dejavu] First execution including JIT compilation took 0.5899574756622314s. -[triton-dejavu] First execution including JIT compilation took 0.3819904327392578s. -[triton-dejavu] First execution including JIT compilation took 1.6209561824798584s. -[triton-dejavu] First execution including JIT compilation took 0.6263985633850098s. -[triton-dejavu] First execution including JIT compilation took 0.38887882232666016s. -[triton-dejavu] First execution including JIT compilation took 1.7282218933105469s. -[triton-dejavu] First execution including JIT compilation took 0.6377005577087402s. -[triton-dejavu] First execution including JIT compilation took 0.4078361988067627s. -bench_cudagraph failed with out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 258048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.6659209728240967s. -[triton-dejavu] First execution including JIT compilation took 0.9628505706787109s. -[triton-dejavu] First execution including JIT compilation took 0.4381530284881592s. -[triton-dejavu] First execution including JIT compilation took 1.6766464710235596s. -[triton-dejavu] First execution including JIT compilation took 0.7337453365325928s. -[triton-dejavu] First execution including JIT compilation took 0.673093318939209s. 
-[triton-dejavu] First execution including JIT compilation took 7.029362678527832s. -[triton-dejavu] First execution including JIT compilation took 1.219388484954834s. -[triton-dejavu] First execution including JIT compilation took 0.8028266429901123s. -[triton-dejavu] First execution including JIT compilation took 6.798900127410889s. -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 368640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 516096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 5.058229684829712s. -[triton-dejavu] First execution including JIT compilation took 1.5925123691558838s. -[triton-dejavu] First execution including JIT compilation took 0.6987450122833252s. -[triton-dejavu] First execution including JIT compilation took 5.16088080406189s. -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 442368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  [repeated triton-dejavu autotuner traceback omitted: triton_dejavu/testing.py _do_bench_cudagraph -> triton_dejavu/autotuner.py kernel_call -> triton/runtime/jit.py run -> triton/compiler/compiler.py _init_handles -> raise OutOfResources]
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760-1179648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 245760-1179648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. (message repeated for each oversized tuning configuration)
-[triton-dejavu] First execution including JIT compilation took 0.29s-7.42s. (timing line repeated for each autotuner configuration)
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 819200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1146880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1146880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.7142207622528076s. -[triton-dejavu] First execution including JIT compilation took 0.8741211891174316s. -[triton-dejavu] First execution including JIT compilation took 0.44619178771972656s. -[triton-dejavu] First execution including JIT compilation took 1.8718609809875488s. -[triton-dejavu] First execution including JIT compilation took 0.9042544364929199s. -[triton-dejavu] First execution including JIT compilation took 0.4581465721130371s. -[triton-dejavu] First execution including JIT compilation took 2.1042685508728027s. -[triton-dejavu] First execution including JIT compilation took 0.908367395401001s. -[triton-dejavu] First execution including JIT compilation took 0.48277711868286133s. -[triton-dejavu] First execution including JIT compilation took 1.7529594898223877s. -[triton-dejavu] First execution including JIT compilation took 0.9210634231567383s. -[triton-dejavu] First execution including JIT compilation took 0.5785129070281982s. -[triton-dejavu] First execution including JIT compilation took 1.9719526767730713s. -[triton-dejavu] First execution including JIT compilation took 0.926983118057251s. -[triton-dejavu] First execution including JIT compilation took 0.47329115867614746s. -[triton-dejavu] First execution including JIT compilation took 1.8675498962402344s. -[triton-dejavu] First execution including JIT compilation took 0.8849301338195801s. -[triton-dejavu] First execution including JIT compilation took 0.4898045063018799s. -[triton-dejavu] First execution including JIT compilation took 1.819542407989502s. -[triton-dejavu] First execution including JIT compilation took 0.981731653213501s. -[triton-dejavu] First execution including JIT compilation took 0.5096790790557861s. -[triton-dejavu] First execution including JIT compilation took 2.11425518989563s. -[triton-dejavu] First execution including JIT compilation took 0.837721586227417s. -[triton-dejavu] First execution including JIT compilation took 0.4882984161376953s. -[triton-dejavu] First execution including JIT compilation took 2.053067922592163s. 
-[triton-dejavu] First execution including JIT compilation took 0.897794246673584s. -[triton-dejavu] First execution including JIT compilation took 0.4767446517944336s. -[triton-dejavu] First execution including JIT compilation took 2.07883358001709s. -[triton-dejavu] First execution including JIT compilation took 1.0238347053527832s. -[triton-dejavu] First execution including JIT compilation took 0.6266560554504395s. -[triton-dejavu] First execution including JIT compilation took 2.814924478530884s. -[triton-dejavu] First execution including JIT compilation took 1.255967378616333s. -[triton-dejavu] First execution including JIT compilation took 0.680903434753418s. -[triton-dejavu] First execution including JIT compilation took 2.395393133163452s. -[triton-dejavu] First execution including JIT compilation took 1.0010457038879395s. -[triton-dejavu] First execution including JIT compilation took 0.6347818374633789s. -[triton-dejavu] First execution including JIT compilation took 2.7960519790649414s. -[triton-dejavu] First execution including JIT compilation took 1.0326149463653564s. -[triton-dejavu] First execution including JIT compilation took 0.5450241565704346s. -[triton-dejavu] First execution including JIT compilation took 2.445779800415039s. -[triton-dejavu] First execution including JIT compilation took 1.0319764614105225s. -[triton-dejavu] First execution including JIT compilation took 0.6632704734802246s. -[triton-dejavu] First execution including JIT compilation took 2.80086088180542s. -[triton-dejavu] First execution including JIT compilation took 1.1742348670959473s. -[triton-dejavu] First execution including JIT compilation took 0.5098991394042969s. -[triton-dejavu] First execution including JIT compilation took 2.790087938308716s. -[triton-dejavu] First execution including JIT compilation took 1.1971583366394043s. -[triton-dejavu] First execution including JIT compilation took 0.5753312110900879s. -[triton-dejavu] First execution including JIT compilation took 3.8199825286865234s. -[triton-dejavu] First execution including JIT compilation took 1.5596168041229248s. -[triton-dejavu] First execution including JIT compilation took 0.7234528064727783s. -[triton-dejavu] First execution including JIT compilation took 3.8001348972320557s. -[triton-dejavu] First execution including JIT compilation took 1.3300747871398926s. -[triton-dejavu] First execution including JIT compilation took 0.8064060211181641s. -[triton-dejavu] First execution including JIT compilation took 3.833221673965454s. -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 245760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 4.838615655899048s. -[triton-dejavu] First execution including JIT compilation took 1.5911104679107666s. -[triton-dejavu] First execution including JIT compilation took 0.7249307632446289s. -[triton-dejavu] First execution including JIT compilation took 5.080144166946411s. -[triton-dejavu] First execution including JIT compilation took 1.7896246910095215s. -[triton-dejavu] First execution including JIT compilation took 0.7319927215576172s. 
-[triton-dejavu] First execution including JIT compilation took 10.777840614318848s. -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 491520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 688128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 688128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ - self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) -RuntimeError: Triton Error [CUDA]: out of memory - -[triton-dejavu] First execution including JIT compilation took 5.033360242843628s. -[triton-dejavu] First execution including JIT compilation took 1.410045862197876s. -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ - self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-
-bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 589824, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 983040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 1179648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 1179648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 1376256, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 1572864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 1572864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 4.680861473083496s.
-[triton-dejavu] First execution including JIT compilation took 2.364461898803711s.
-[triton-dejavu] First execution including JIT compilation took 0.8534502983093262s.
-[triton-dejavu] First execution including JIT compilation took 4.708017349243164s.
-[triton-dejavu] First execution including JIT compilation took 2.841503858566284s.
-[triton-dejavu] First execution including JIT compilation took 1.0484719276428223s.
-[triton-dejavu] First execution including JIT compilation took 4.7807886600494385s.
-[triton-dejavu] First execution including JIT compilation took 2.8980062007904053s.
-[triton-dejavu] First execution including JIT compilation took 1.0707988739013672s.
-[triton-dejavu] First execution including JIT compilation took 4.607600212097168s.
-[triton-dejavu] First execution including JIT compilation took 2.8636832237243652s.
-[triton-dejavu] First execution including JIT compilation took 1.1431879997253418s.
-[triton-dejavu] First execution including JIT compilation took 4.923970699310303s.
-[triton-dejavu] First execution including JIT compilation took 2.79614520072937s.
-[triton-dejavu] First execution including JIT compilation took 1.0749492645263672s.
-[triton-dejavu] First execution including JIT compilation took 4.696893692016602s.
-[triton-dejavu] First execution including JIT compilation took 2.8622703552246094s.
-[triton-dejavu] First execution including JIT compilation took 1.0982391834259033s.
-[triton-dejavu] First execution including JIT compilation took 4.7404444217681885s.
-[triton-dejavu] First execution including JIT compilation took 2.878173828125s.
-[triton-dejavu] First execution including JIT compilation took 1.1065995693206787s.
-[triton-dejavu] First execution including JIT compilation took 4.991016626358032s.
-[triton-dejavu] First execution including JIT compilation took 2.5021591186523438s.
-[triton-dejavu] First execution including JIT compilation took 0.9695248603820801s.
-[triton-dejavu] First execution including JIT compilation took 5.3018670082092285s.
-[triton-dejavu] First execution including JIT compilation took 3.273489236831665s.
-[triton-dejavu] First execution including JIT compilation took 1.181260108947754s.
-[triton-dejavu] First execution including JIT compilation took 5.431257247924805s.
-[triton-dejavu] First execution including JIT compilation took 3.352473497390747s.
-[triton-dejavu] First execution including JIT compilation took 1.186856985092163s.
-[triton-dejavu] First execution including JIT compilation took 5.393920183181763s.
-[triton-dejavu] First execution including JIT compilation took 3.40191650390625s.
-[triton-dejavu] First execution including JIT compilation took 1.1941492557525635s.
-[triton-dejavu] First execution including JIT compilation took 5.543420314788818s.
-[triton-dejavu] First execution including JIT compilation took 3.3016717433929443s.
-[triton-dejavu] First execution including JIT compilation took 1.2081632614135742s.
-[triton-dejavu] First execution including JIT compilation took 5.640880107879639s.
-[triton-dejavu] First execution including JIT compilation took 3.5443694591522217s.
-[triton-dejavu] First execution including JIT compilation took 1.3958439826965332s.
-[triton-dejavu] First execution including JIT compilation took 5.6015305519104s.
-bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 6.935962200164795s.
-[triton-dejavu] First execution including JIT compilation took 3.3080406188964844s.
-[triton-dejavu] First execution including JIT compilation took 1.2709336280822754s.
-[triton-dejavu] First execution including JIT compilation took 7.072402715682983s.
-[triton-dejavu] First execution including JIT compilation took 3.7861485481262207s.
-[triton-dejavu] First execution including JIT compilation took 1.4361011981964111s.
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 5.464360475540161s.
-[triton-dejavu] First execution including JIT compilation took 1.6160335540771484s.
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 327680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 458752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-
-[triton-dejavu] First execution including JIT compilation took 5.580164670944214s.
-[triton-dejavu] First execution including JIT compilation took 2.2763874530792236s.
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 393216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 655360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 917504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 917504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 524288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 786432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1048576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1310720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1572864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1572864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1572864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1572864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1835008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1835008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2097152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2097152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2097152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2097152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
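For context on the OutOfResources failures above: the autotuner sweeps tile shapes, and any configuration whose staged tiles do not fit into on-chip shared memory is rejected at compile time instead of being benchmarked. Below is a minimal, hedged sketch of the usual accounting for a pipelined matmul-style kernel, assuming one A tile (BLOCK_M x BLOCK_K) and one B tile (BLOCK_K x BLOCK_N) are buffered per software-pipeline stage; the exact formula inside Triton is kernel-specific, and the helper name and example block shapes here are made up for illustration only.

# Back-of-the-envelope shared-memory estimate for a pipelined Triton matmul-style
# kernel. This is NOT the exact accounting Triton performs; it only illustrates
# why large tiles combined with several num_stages exceed the 232448-byte limit
# reported in the log, and why shrinking block sizes or num_stages helps.

def estimated_smem_bytes(block_m: int, block_n: int, block_k: int,
                         num_stages: int, dtype_bytes: int = 2) -> int:
    # One A tile plus one B tile per stage, each buffered num_stages times.
    per_stage = (block_m * block_k + block_k * block_n) * dtype_bytes
    return per_stage * num_stages

HW_LIMIT = 232448  # shared-memory limit reported in the log above

# Hypothetical example configurations (bfloat16 operands, 2 bytes each):
for cfg in [
    dict(block_m=128, block_n=128, block_k=128, num_stages=4),  # ~262144 B -> rejected
    dict(block_m=64,  block_n=128, block_k=64,  num_stages=4),  # fits
    dict(block_m=16,  block_n=32,  block_k=64,  num_stages=4),  # the kind of small tile the tuner kept
]:
    need = estimated_smem_bytes(**cfg)
    print(cfg, f"~{need} B", "SKIP" if need > HW_LIMIT else "ok")

Under this rough model, halving a block dimension or dropping one pipeline stage reduces the requirement proportionally, which matches the hint in the error message ("Reducing block sizes or `num_stages` may help.").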
-
-[triton-dejavu] added BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 32, BLOCK_SIZE_K: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None for _bmm_chunk_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-25b6b5e18b4b4e9d94bc6cfc6e07052ef952503581ca3a6592f943790d859cd8/tune_features-b815cf0dca1de8dc8520ba45f9861122ec38d2b40655a5044d5da8dee5b249cf/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default and key ('256', '128', 'False', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.int32')
-[2025-07-23 17:21:31] Triton autotuning for function _bmm_chunk_fwd_kernel finished after 10756.57s; best config selected: BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 32, BLOCK_SIZE_K: 64, num_warps: 4, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None with benchmark time 0.002230335958302021; evaluated 2625 configurations;
-[triton-dejavu] ('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16') not in cache, starting to tune...
-[triton-dejavu] [2025-07-23 17:21:31] Started benchmarking of 2625 configurations... (use_bo: False, run: 0)
[Roughly one hundred "[triton-dejavu] First execution including JIT compilation took ...s." lines follow, reporting per-configuration first-run JIT compilation times between about 0.25s and 1.15s.]
-[triton-dejavu] First execution including JIT compilation took 0.33637166023254395s. -[triton-dejavu] First execution including JIT compilation took 0.32270336151123047s. -[triton-dejavu] First execution including JIT compilation took 0.43533897399902344s. -[triton-dejavu] First execution including JIT compilation took 0.37600111961364746s. -[triton-dejavu] First execution including JIT compilation took 0.39075493812561035s. -[triton-dejavu] First execution including JIT compilation took 0.5955746173858643s. -[triton-dejavu] First execution including JIT compilation took 0.420551061630249s. -[triton-dejavu] First execution including JIT compilation took 0.38468456268310547s. -[triton-dejavu] First execution including JIT compilation took 0.48545384407043457s. -[triton-dejavu] First execution including JIT compilation took 0.43158817291259766s. -[triton-dejavu] First execution including JIT compilation took 0.42005348205566406s. -[triton-dejavu] First execution including JIT compilation took 0.5997962951660156s. -[triton-dejavu] First execution including JIT compilation took 0.4429283142089844s. -[triton-dejavu] First execution including JIT compilation took 0.40537381172180176s. -[triton-dejavu] First execution including JIT compilation took 0.5108773708343506s. -[triton-dejavu] First execution including JIT compilation took 0.4490795135498047s. -[triton-dejavu] First execution including JIT compilation took 0.4208858013153076s. -[triton-dejavu] First execution including JIT compilation took 0.6792380809783936s. -[triton-dejavu] First execution including JIT compilation took 0.467818021774292s. -[triton-dejavu] First execution including JIT compilation took 0.4417719841003418s. -[triton-dejavu] First execution including JIT compilation took 0.4436028003692627s. -[triton-dejavu] First execution including JIT compilation took 0.3732438087463379s. -[triton-dejavu] First execution including JIT compilation took 0.3606081008911133s. -[triton-dejavu] First execution including JIT compilation took 0.4783363342285156s. -[triton-dejavu] First execution including JIT compilation took 0.40464305877685547s. -[triton-dejavu] First execution including JIT compilation took 0.38185811042785645s. -[triton-dejavu] First execution including JIT compilation took 0.5133819580078125s. -[triton-dejavu] First execution including JIT compilation took 0.43381595611572266s. -[triton-dejavu] First execution including JIT compilation took 0.42664098739624023s. -[triton-dejavu] First execution including JIT compilation took 0.5179893970489502s. -[triton-dejavu] First execution including JIT compilation took 0.46022605895996094s. -[triton-dejavu] First execution including JIT compilation took 0.4134035110473633s. -[triton-dejavu] First execution including JIT compilation took 0.5401298999786377s. -[triton-dejavu] First execution including JIT compilation took 0.4478724002838135s. -[triton-dejavu] First execution including JIT compilation took 0.42383623123168945s. -[triton-dejavu] First execution including JIT compilation took 0.5332937240600586s. -[triton-dejavu] First execution including JIT compilation took 0.49991607666015625s. -[triton-dejavu] First execution including JIT compilation took 0.41617631912231445s. -[triton-dejavu] First execution including JIT compilation took 0.601402759552002s. -[triton-dejavu] First execution including JIT compilation took 0.4842853546142578s. -[triton-dejavu] First execution including JIT compilation took 0.4743378162384033s. 
-[triton-dejavu] First execution including JIT compilation took 0.4953763484954834s. -[triton-dejavu] First execution including JIT compilation took 0.4030179977416992s. -[triton-dejavu] First execution including JIT compilation took 0.37740230560302734s. -[triton-dejavu] First execution including JIT compilation took 0.5496475696563721s. -[triton-dejavu] First execution including JIT compilation took 0.4513847827911377s. -[triton-dejavu] First execution including JIT compilation took 0.43414807319641113s. -[triton-dejavu] First execution including JIT compilation took 0.5767252445220947s. -[triton-dejavu] First execution including JIT compilation took 0.4709329605102539s. -[triton-dejavu] First execution including JIT compilation took 0.41281580924987793s. -[triton-dejavu] First execution including JIT compilation took 0.6148693561553955s. -[triton-dejavu] First execution including JIT compilation took 0.6631579399108887s. -[triton-dejavu] First execution including JIT compilation took 0.42415881156921387s. -[triton-dejavu] First execution including JIT compilation took 0.6116061210632324s. -[triton-dejavu] First execution including JIT compilation took 0.5052735805511475s. -[triton-dejavu] First execution including JIT compilation took 0.42380595207214355s. -[triton-dejavu] First execution including JIT compilation took 0.9918599128723145s. -[triton-dejavu] First execution including JIT compilation took 0.4996817111968994s. -[triton-dejavu] First execution including JIT compilation took 0.44182300567626953s. -[triton-dejavu] First execution including JIT compilation took 0.69724440574646s. -[triton-dejavu] First execution including JIT compilation took 0.6008899211883545s. -[triton-dejavu] First execution including JIT compilation took 0.48003244400024414s. -[triton-dejavu] First execution including JIT compilation took 0.6034128665924072s. -[triton-dejavu] First execution including JIT compilation took 0.49086999893188477s. -[triton-dejavu] First execution including JIT compilation took 0.40519237518310547s. -[triton-dejavu] First execution including JIT compilation took 0.6755588054656982s. -[triton-dejavu] First execution including JIT compilation took 0.48955368995666504s. -[triton-dejavu] First execution including JIT compilation took 0.4392104148864746s. -[triton-dejavu] First execution including JIT compilation took 0.7415237426757812s. -[triton-dejavu] First execution including JIT compilation took 0.5113849639892578s. -[triton-dejavu] First execution including JIT compilation took 0.44628405570983887s. -[triton-dejavu] First execution including JIT compilation took 0.730881929397583s. -[triton-dejavu] First execution including JIT compilation took 0.538200855255127s. -[triton-dejavu] First execution including JIT compilation took 0.45828986167907715s. -[triton-dejavu] First execution including JIT compilation took 0.8166990280151367s. -[triton-dejavu] First execution including JIT compilation took 0.5725693702697754s. -[triton-dejavu] First execution including JIT compilation took 0.42383289337158203s. -[triton-dejavu] First execution including JIT compilation took 0.7243595123291016s. -[triton-dejavu] First execution including JIT compilation took 0.578228235244751s. -[triton-dejavu] First execution including JIT compilation took 0.3952150344848633s. -[triton-dejavu] First execution including JIT compilation took 0.7390725612640381s. -[triton-dejavu] First execution including JIT compilation took 0.6426718235015869s. 
-[triton-dejavu] First execution including JIT compilation took 0.4642622470855713s. -[triton-dejavu] First execution including JIT compilation took 0.7362449169158936s. -[triton-dejavu] First execution including JIT compilation took 0.4661426544189453s. -[triton-dejavu] First execution including JIT compilation took 0.37535643577575684s. -[triton-dejavu] First execution including JIT compilation took 0.962334156036377s. -[triton-dejavu] First execution including JIT compilation took 0.7253522872924805s. -[triton-dejavu] First execution including JIT compilation took 0.5154576301574707s. -[triton-dejavu] First execution including JIT compilation took 1.1596920490264893s. -[triton-dejavu] First execution including JIT compilation took 0.693903923034668s. -[triton-dejavu] First execution including JIT compilation took 0.542579174041748s. -[triton-dejavu] First execution including JIT compilation took 1.143411636352539s. -[triton-dejavu] First execution including JIT compilation took 0.5730850696563721s. -[triton-dejavu] First execution including JIT compilation took 0.45581793785095215s. -[triton-dejavu] First execution including JIT compilation took 1.1371288299560547s. -[triton-dejavu] First execution including JIT compilation took 0.6347072124481201s. -[triton-dejavu] First execution including JIT compilation took 0.5362868309020996s. -[triton-dejavu] First execution including JIT compilation took 1.1827235221862793s. -[triton-dejavu] First execution including JIT compilation took 0.6695809364318848s. -[triton-dejavu] First execution including JIT compilation took 0.594818115234375s. -bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 0.4952049255371094s.
-bench_cudagraph failed with CUDA error: out of memory
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1
-Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.38374853134155273s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.34327268600463867s. -[triton-dejavu] First execution including JIT compilation took 0.5115656852722168s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.41890788078308105s. -[triton-dejavu] First execution including JIT compilation took 0.36414170265197754s. -[triton-dejavu] First execution including JIT compilation took 0.551964282989502s. 
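The capture_end() failures above arise while timing a candidate inside a CUDA graph: the capture itself needs additional device memory, so a long sweep can hit "CUDA error: out of memory" at this point even though the kernel compiled. A minimal sketch of that benchmarking pattern in plain PyTorch follows; it is not the triton_dejavu.testing implementation, and the function name and iteration counts are illustrative.

import torch

def bench_with_cuda_graph(fn, warmup=3, rep=10):
    # Warm up eagerly and drain all outstanding work before capture.
    for _ in range(warmup):
        fn()
    torch.cuda.synchronize()

    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g):  # records one invocation into the graph
        fn()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(rep):
        g.replay()  # replays the captured launch without per-call CPU overhead
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / rep  # mean milliseconds per call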
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.43802404403686523s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.3844606876373291s. -[triton-dejavu] First execution including JIT compilation took 0.5653436183929443s. -[triton-dejavu] First execution including JIT compilation took 0.4929649829864502s. -[triton-dejavu] First execution including JIT compilation took 0.4082679748535156s. -[triton-dejavu] First execution including JIT compilation took 0.6021175384521484s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.479703426361084s. -[triton-dejavu] First execution including JIT compilation took 0.4264242649078369s. -[triton-dejavu] First execution including JIT compilation took 0.625530481338501s. -[triton-dejavu] First execution including JIT compilation took 0.5222625732421875s. -[triton-dejavu] First execution including JIT compilation took 0.4469916820526123s. -[triton-dejavu] First execution including JIT compilation took 0.6354126930236816s. -[triton-dejavu] First execution including JIT compilation took 0.5203869342803955s. -[triton-dejavu] First execution including JIT compilation took 0.46642088890075684s. -[triton-dejavu] First execution including JIT compilation took 0.588517427444458s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.49373459815979004s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.3735010623931885s. -[triton-dejavu] First execution including JIT compilation took 0.5814676284790039s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4442164897918701s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.3985586166381836s. -[triton-dejavu] First execution including JIT compilation took 0.6262340545654297s. 
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.47362422943115234s. -[triton-dejavu] First execution including JIT compilation took 0.41541028022766113s. -[triton-dejavu] First execution including JIT compilation took 0.8599624633789062s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6292743682861328s. -[triton-dejavu] First execution including JIT compilation took 0.4856271743774414s. -[triton-dejavu] First execution including JIT compilation took 0.6879935264587402s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5188641548156738s. -[triton-dejavu] First execution including JIT compilation took 0.4481801986694336s. -[triton-dejavu] First execution including JIT compilation took 0.682525634765625s. -[triton-dejavu] First execution including JIT compilation took 0.603325605392456s. -[triton-dejavu] First execution including JIT compilation took 0.45879626274108887s. -[triton-dejavu] First execution including JIT compilation took 0.7078754901885986s. -[triton-dejavu] First execution including JIT compilation took 0.5560562610626221s. -[triton-dejavu] First execution including JIT compilation took 0.45784831047058105s. -[triton-dejavu] First execution including JIT compilation took 0.67889404296875s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.45562100410461426s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.38816308975219727s. -[triton-dejavu] First execution including JIT compilation took 0.6418576240539551s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.48399782180786133s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.41680216789245605s. -[triton-dejavu] First execution including JIT compilation took 0.7157330513000488s. 
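Logs like the excerpt above mix successful first executions with two failure modes (shared-memory overflow and graph-capture OOM). A small helper can summarize them when triaging a sweep; this is a sketch, and the log file name is purely illustrative.

from collections import Counter

def summarize(log_text):
    counts = Counter()
    for line in log_text.splitlines():
        line = line.lstrip("-")  # tolerate the leading diff marker in this patch
        if "First execution including JIT compilation took" in line:
            counts["compiled"] += 1
        elif line.startswith("bench_cudagraph failed with out of resource"):
            counts["shared_memory"] += 1
        elif line.startswith("bench_cudagraph failed with CUDA error: out of memory"):
            counts["graph_capture_oom"] += 1
    return counts

# Example: summarize(open("autotune.log").read())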
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5226426124572754s. -[triton-dejavu] First execution including JIT compilation took 0.44080543518066406s. -[triton-dejavu] First execution including JIT compilation took 0.7858779430389404s. -[triton-dejavu] First execution including JIT compilation took 0.5671470165252686s. -[triton-dejavu] First execution including JIT compilation took 0.45592260360717773s. -[triton-dejavu] First execution including JIT compilation took 0.8100578784942627s. -[triton-dejavu] First execution including JIT compilation took 0.6213173866271973s. -[triton-dejavu] First execution including JIT compilation took 0.47237181663513184s. -[triton-dejavu] First execution including JIT compilation took 0.7891368865966797s. -[triton-dejavu] First execution including JIT compilation took 0.6662912368774414s. -[triton-dejavu] First execution including JIT compilation took 0.4879744052886963s. -[triton-dejavu] First execution including JIT compilation took 0.731757640838623s. -[triton-dejavu] First execution including JIT compilation took 0.4918680191040039s. -[triton-dejavu] First execution including JIT compilation took 0.37989187240600586s. -[triton-dejavu] First execution including JIT compilation took 0.6664383411407471s. -[triton-dejavu] First execution including JIT compilation took 0.4817678928375244s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.3559696674346924s. -[triton-dejavu] First execution including JIT compilation took 0.8642761707305908s. -[triton-dejavu] First execution including JIT compilation took 0.5662164688110352s. -[triton-dejavu] First execution including JIT compilation took 0.45751142501831055s. -[triton-dejavu] First execution including JIT compilation took 0.9735383987426758s. -[triton-dejavu] First execution including JIT compilation took 0.6600606441497803s. -[triton-dejavu] First execution including JIT compilation took 0.48941469192504883s. -[triton-dejavu] First execution including JIT compilation took 1.0599989891052246s. -[triton-dejavu] First execution including JIT compilation took 0.5858447551727295s. -[triton-dejavu] First execution including JIT compilation took 0.40030384063720703s. -[triton-dejavu] First execution including JIT compilation took 0.9032082557678223s. -[triton-dejavu] First execution including JIT compilation took 0.5963606834411621s. -[triton-dejavu] First execution including JIT compilation took 0.5698938369750977s. -[triton-dejavu] First execution including JIT compilation took 0.9204597473144531s. -[triton-dejavu] First execution including JIT compilation took 0.7513656616210938s. -[triton-dejavu] First execution including JIT compilation took 0.5392777919769287s. -[triton-dejavu] First execution including JIT compilation took 1.3184521198272705s. -[triton-dejavu] First execution including JIT compilation took 0.7888948917388916s. -[triton-dejavu] First execution including JIT compilation took 0.6177425384521484s. -[triton-dejavu] First execution including JIT compilation took 1.1905827522277832s. -[triton-dejavu] First execution including JIT compilation took 0.7364373207092285s. -[triton-dejavu] First execution including JIT compilation took 0.5242094993591309s. -[triton-dejavu] First execution including JIT compilation took 1.2864527702331543s. -[triton-dejavu] First execution including JIT compilation took 0.8166484832763672s. -[triton-dejavu] First execution including JIT compilation took 0.5594861507415771s. -[triton-dejavu] First execution including JIT compilation took 1.8655834197998047s. -[triton-dejavu] First execution including JIT compilation took 0.9145352840423584s. -[triton-dejavu] First execution including JIT compilation took 0.6113896369934082s. -[triton-dejavu] First execution including JIT compilation took 1.9301745891571045s. -[triton-dejavu] First execution including JIT compilation took 1.0628697872161865s. -[triton-dejavu] First execution including JIT compilation took 0.64133620262146s. -[triton-dejavu] First execution including JIT compilation took 2.2749366760253906s. -[triton-dejavu] First execution including JIT compilation took 1.0524189472198486s. -[triton-dejavu] First execution including JIT compilation took 0.677316427230835s. -bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] First execution including JIT compilation took 0.6699347496032715s.
-[triton-dejavu] First execution including JIT compilation took 0.39471912384033203s.
-[triton-dejavu] First execution including JIT compilation took 0.31299400329589844s.
-[triton-dejavu] First execution including JIT compilation took 0.9980213642120361s.
-[triton-dejavu] First execution including JIT compilation took 0.4564340114593506s.
-[triton-dejavu] First execution including JIT compilation took 0.39405226707458496s.
-[triton-dejavu] First execution including JIT compilation took 0.721914529800415s. -[triton-dejavu] First execution including JIT compilation took 0.5424695014953613s. -[triton-dejavu] First execution including JIT compilation took 0.41809797286987305s. -[triton-dejavu] First execution including JIT compilation took 0.7378096580505371s. -[triton-dejavu] First execution including JIT compilation took 0.538069486618042s. -[triton-dejavu] First execution including JIT compilation took 0.43320608139038086s. -[triton-dejavu] First execution including JIT compilation took 0.8680074214935303s. -[triton-dejavu] First execution including JIT compilation took 0.5815584659576416s. -[triton-dejavu] First execution including JIT compilation took 0.44110822677612305s. -[triton-dejavu] First execution including JIT compilation took 0.797199010848999s. -[triton-dejavu] First execution including JIT compilation took 0.7567603588104248s. -[triton-dejavu] First execution including JIT compilation took 0.47153782844543457s. -[triton-dejavu] First execution including JIT compilation took 0.8809914588928223s. -[triton-dejavu] First execution including JIT compilation took 0.6448085308074951s. -[triton-dejavu] First execution including JIT compilation took 0.5167965888977051s. -[triton-dejavu] First execution including JIT compilation took 0.745863676071167s. -[triton-dejavu] First execution including JIT compilation took 0.5225260257720947s. -[triton-dejavu] First execution including JIT compilation took 0.4189014434814453s. -[triton-dejavu] First execution including JIT compilation took 0.7760834693908691s. -[triton-dejavu] First execution including JIT compilation took 0.5539810657501221s. -[triton-dejavu] First execution including JIT compilation took 0.44478821754455566s. -[triton-dejavu] First execution including JIT compilation took 0.8012809753417969s. -[triton-dejavu] First execution including JIT compilation took 0.6483604907989502s. -[triton-dejavu] First execution including JIT compilation took 0.4678480625152588s. -[triton-dejavu] First execution including JIT compilation took 0.8454635143280029s. -[triton-dejavu] First execution including JIT compilation took 0.6168031692504883s. -[triton-dejavu] First execution including JIT compilation took 0.6612381935119629s. -[triton-dejavu] First execution including JIT compilation took 0.8955931663513184s. -[triton-dejavu] First execution including JIT compilation took 0.6233341693878174s. -[triton-dejavu] First execution including JIT compilation took 0.4918363094329834s. -[triton-dejavu] First execution including JIT compilation took 0.9699711799621582s. -[triton-dejavu] First execution including JIT compilation took 0.6717431545257568s. -[triton-dejavu] First execution including JIT compilation took 0.5321164131164551s. -[triton-dejavu] First execution including JIT compilation took 1.006484031677246s. -[triton-dejavu] First execution including JIT compilation took 0.7167990207672119s. -[triton-dejavu] First execution including JIT compilation took 0.5426886081695557s. -[triton-dejavu] First execution including JIT compilation took 0.8549697399139404s. -[triton-dejavu] First execution including JIT compilation took 0.5004158020019531s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.42880892753601074s. -[triton-dejavu] First execution including JIT compilation took 0.8870432376861572s. -[triton-dejavu] First execution including JIT compilation took 0.5380675792694092s. -[triton-dejavu] First execution including JIT compilation took 0.44513392448425293s. -[triton-dejavu] First execution including JIT compilation took 1.0106170177459717s. -[triton-dejavu] First execution including JIT compilation took 0.6438839435577393s. -[triton-dejavu] First execution including JIT compilation took 0.48810815811157227s. -[triton-dejavu] First execution including JIT compilation took 1.1047391891479492s. -[triton-dejavu] First execution including JIT compilation took 0.6829500198364258s. -[triton-dejavu] First execution including JIT compilation took 0.5343265533447266s. -[triton-dejavu] First execution including JIT compilation took 1.1722900867462158s. -[triton-dejavu] First execution including JIT compilation took 0.7511520385742188s. -[triton-dejavu] First execution including JIT compilation took 0.5391092300415039s. -[triton-dejavu] First execution including JIT compilation took 1.2446460723876953s. -[triton-dejavu] First execution including JIT compilation took 0.7718749046325684s. -[triton-dejavu] First execution including JIT compilation took 0.549095630645752s. -[triton-dejavu] First execution including JIT compilation took 1.3546397686004639s. -[triton-dejavu] First execution including JIT compilation took 0.7892618179321289s. -[triton-dejavu] First execution including JIT compilation took 0.46314549446105957s. -[triton-dejavu] First execution including JIT compilation took 0.9860119819641113s. -[triton-dejavu] First execution including JIT compilation took 0.8724544048309326s. -[triton-dejavu] First execution including JIT compilation took 0.4373140335083008s. -[triton-dejavu] First execution including JIT compilation took 1.0243175029754639s. -[triton-dejavu] First execution including JIT compilation took 0.6186015605926514s. -[triton-dejavu] First execution including JIT compilation took 0.4280831813812256s. -[triton-dejavu] First execution including JIT compilation took 1.5726463794708252s. -[triton-dejavu] First execution including JIT compilation took 0.9008209705352783s. -[triton-dejavu] First execution including JIT compilation took 0.44704723358154297s. -[triton-dejavu] First execution including JIT compilation took 1.6724953651428223s. -[triton-dejavu] First execution including JIT compilation took 0.8446671962738037s. -[triton-dejavu] First execution including JIT compilation took 0.4729273319244385s. -[triton-dejavu] First execution including JIT compilation took 1.7400548458099365s. 
-[triton-dejavu] First execution including JIT compilation took 0.8373644351959229s. -[triton-dejavu] First execution including JIT compilation took 0.5513191223144531s. -[triton-dejavu] First execution including JIT compilation took 1.8062167167663574s. -[triton-dejavu] First execution including JIT compilation took 0.8943934440612793s. -[triton-dejavu] First execution including JIT compilation took 0.5011510848999023s. -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.683664321899414s. -[triton-dejavu] First execution including JIT compilation took 0.8943469524383545s. -[triton-dejavu] First execution including JIT compilation took 0.5135440826416016s. -[triton-dejavu] First execution including JIT compilation took 1.9431862831115723s. -[triton-dejavu] First execution including JIT compilation took 1.2609891891479492s. -[triton-dejavu] First execution including JIT compilation took 0.8276565074920654s. -[triton-dejavu] First execution including JIT compilation took 6.717592477798462s. -[triton-dejavu] First execution including JIT compilation took 1.4402704238891602s. -[triton-dejavu] First execution including JIT compilation took 0.6466906070709229s. -bench_cudagraph failed with out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.9890782833099365s. -[triton-dejavu] First execution including JIT compilation took 0.6056652069091797s. -[triton-dejavu] First execution including JIT compilation took 0.37941598892211914s. -[triton-dejavu] First execution including JIT compilation took 0.9751529693603516s. -[triton-dejavu] First execution including JIT compilation took 0.5807168483734131s. -[triton-dejavu] First execution including JIT compilation took 0.46853089332580566s. 
-[triton-dejavu] First execution including JIT compilation took 1.0114789009094238s. -[triton-dejavu] First execution including JIT compilation took 0.6656544208526611s. -[triton-dejavu] First execution including JIT compilation took 0.4591987133026123s. -[triton-dejavu] First execution including JIT compilation took 1.2622675895690918s. -[triton-dejavu] First execution including JIT compilation took 0.7641468048095703s. -[triton-dejavu] First execution including JIT compilation took 0.5717291831970215s. -[triton-dejavu] First execution including JIT compilation took 1.4783613681793213s. -[triton-dejavu] First execution including JIT compilation took 0.674863338470459s. -[triton-dejavu] First execution including JIT compilation took 0.4595465660095215s. -[triton-dejavu] First execution including JIT compilation took 1.3642892837524414s. -[triton-dejavu] First execution including JIT compilation took 0.6485683917999268s. -[triton-dejavu] First execution including JIT compilation took 0.48433613777160645s. -[triton-dejavu] First execution including JIT compilation took 1.2521538734436035s. -[triton-dejavu] First execution including JIT compilation took 0.6855838298797607s. -[triton-dejavu] First execution including JIT compilation took 0.5421812534332275s. -[triton-dejavu] First execution including JIT compilation took 1.2270712852478027s. -[triton-dejavu] First execution including JIT compilation took 0.5855932235717773s. -[triton-dejavu] First execution including JIT compilation took 0.3942844867706299s. -[triton-dejavu] First execution including JIT compilation took 1.0544099807739258s. -[triton-dejavu] First execution including JIT compilation took 0.6344761848449707s. -[triton-dejavu] First execution including JIT compilation took 0.4506490230560303s. -[triton-dejavu] First execution including JIT compilation took 1.6647655963897705s. -[triton-dejavu] First execution including JIT compilation took 0.7378494739532471s. -[triton-dejavu] First execution including JIT compilation took 0.4730367660522461s. -[triton-dejavu] First execution including JIT compilation took 1.290454387664795s. -[triton-dejavu] First execution including JIT compilation took 0.75484299659729s. -[triton-dejavu] First execution including JIT compilation took 0.483842134475708s. -[triton-dejavu] First execution including JIT compilation took 1.2890782356262207s. -[triton-dejavu] First execution including JIT compilation took 0.7153482437133789s. -[triton-dejavu] First execution including JIT compilation took 0.5798866748809814s. -[triton-dejavu] First execution including JIT compilation took 1.450512170791626s. -[triton-dejavu] First execution including JIT compilation took 0.7689251899719238s. -[triton-dejavu] First execution including JIT compilation took 0.5422661304473877s. -[triton-dejavu] First execution including JIT compilation took 1.3612980842590332s. -[triton-dejavu] First execution including JIT compilation took 0.7846195697784424s. -[triton-dejavu] First execution including JIT compilation took 0.6903204917907715s. -[triton-dejavu] First execution including JIT compilation took 1.3367185592651367s. -[triton-dejavu] First execution including JIT compilation took 0.7425005435943604s. -[triton-dejavu] First execution including JIT compilation took 0.5718681812286377s. -[triton-dejavu] First execution including JIT compilation took 1.687856674194336s. -[triton-dejavu] First execution including JIT compilation took 0.6989390850067139s. 
-[triton-dejavu] First execution including JIT compilation took 0.45311927795410156s. -[triton-dejavu] First execution including JIT compilation took 1.8834350109100342s. -[triton-dejavu] First execution including JIT compilation took 0.7894504070281982s. -[triton-dejavu] First execution including JIT compilation took 0.5649204254150391s. -[triton-dejavu] First execution including JIT compilation took 2.243360757827759s. -[triton-dejavu] First execution including JIT compilation took 0.8881011009216309s. -[triton-dejavu] First execution including JIT compilation took 0.6061875820159912s. -[triton-dejavu] First execution including JIT compilation took 2.1226956844329834s. -[triton-dejavu] First execution including JIT compilation took 0.8810961246490479s. -[triton-dejavu] First execution including JIT compilation took 0.665024995803833s. -[triton-dejavu] First execution including JIT compilation took 2.3865857124328613s. -[triton-dejavu] First execution including JIT compilation took 0.9415686130523682s. -[triton-dejavu] First execution including JIT compilation took 0.6442673206329346s. -bench_cudagraph failed with out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 2.141575574874878s. -[triton-dejavu] First execution including JIT compilation took 0.9174015522003174s. -[triton-dejavu] First execution including JIT compilation took 0.5408012866973877s. -[triton-dejavu] First execution including JIT compilation took 2.082519292831421s. -[triton-dejavu] First execution including JIT compilation took 1.0053105354309082s. -[triton-dejavu] First execution including JIT compilation took 0.5957515239715576s. 
-[triton-dejavu] First execution including JIT compilation took 6.80155086517334s. -[triton-dejavu] First execution including JIT compilation took 1.5097930431365967s. -[triton-dejavu] First execution including JIT compilation took 0.9143364429473877s. -[triton-dejavu] First execution including JIT compilation took 6.931457042694092s. -[triton-dejavu] First execution including JIT compilation took 1.593360185623169s. -[triton-dejavu] First execution including JIT compilation took 0.7019162178039551s. -bench_cudagraph failed with out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 377856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 377856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 377856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 377856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 377856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 377856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-[... the same OutOfResources traceback repeats for further autotuner configurations, with Required shared memory of 307200, 456704, 527360, 606208, 755712, and 1054720 bytes against the 232448-byte hardware limit ...]
-[triton-dejavu] First execution including JIT compilation took 4.745112180709839s.
-[triton-dejavu] First execution including JIT compilation took 1.5900769233703613s.
-[triton-dejavu] First execution including JIT compilation took 0.7577598094940186s.
-bench_cudagraph failed with CUDA error: out of memory
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1
-Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
-
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph
-    with torch.cuda.graph(g):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__
-    self.cuda_graph.capture_end()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end
-    super().capture_end()
-RuntimeError: CUDA error: out of memory
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1
-Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
-
-[... this CUDA-graph-capture out-of-memory failure and the accompanying "[triton-dejavu] First execution including JIT compilation took ...s." timing lines repeat, with timings between roughly 0.36s and 0.74s, for the remaining autotuner configurations ...]
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4353518486022949s. -[triton-dejavu] First execution including JIT compilation took 0.6790196895599365s. -[triton-dejavu] First execution including JIT compilation took 0.5248758792877197s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4504244327545166s. -[triton-dejavu] First execution including JIT compilation took 0.7124292850494385s. -[triton-dejavu] First execution including JIT compilation took 0.8151717185974121s. -[triton-dejavu] First execution including JIT compilation took 0.4823343753814697s. -[triton-dejavu] First execution including JIT compilation took 0.772719144821167s. -[triton-dejavu] First execution including JIT compilation took 0.6169350147247314s. -[triton-dejavu] First execution including JIT compilation took 0.5196678638458252s. -[triton-dejavu] First execution including JIT compilation took 0.7186233997344971s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4982566833496094s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4104623794555664s. -[triton-dejavu] First execution including JIT compilation took 1.1141915321350098s. -[triton-dejavu] First execution including JIT compilation took 0.5356433391571045s. -[triton-dejavu] First execution including JIT compilation took 0.4406569004058838s. -[triton-dejavu] First execution including JIT compilation took 0.8371496200561523s. -[triton-dejavu] First execution including JIT compilation took 0.5642838478088379s. -[triton-dejavu] First execution including JIT compilation took 0.4726717472076416s. -[triton-dejavu] First execution including JIT compilation took 0.93656325340271s. -[triton-dejavu] First execution including JIT compilation took 0.6194779872894287s. -[triton-dejavu] First execution including JIT compilation took 0.4953165054321289s. -[triton-dejavu] First execution including JIT compilation took 0.9690747261047363s. -[triton-dejavu] First execution including JIT compilation took 0.6501588821411133s. -[triton-dejavu] First execution including JIT compilation took 0.5288493633270264s. -[triton-dejavu] First execution including JIT compilation took 1.2569010257720947s. -[triton-dejavu] First execution including JIT compilation took 0.6968162059783936s. -[triton-dejavu] First execution including JIT compilation took 0.5399911403656006s. -[triton-dejavu] First execution including JIT compilation took 1.2143075466156006s. -[triton-dejavu] First execution including JIT compilation took 0.733314037322998s. -[triton-dejavu] First execution including JIT compilation took 0.6001999378204346s. -[triton-dejavu] First execution including JIT compilation took 1.1585540771484375s. -[triton-dejavu] First execution including JIT compilation took 0.7061564922332764s. -[triton-dejavu] First execution including JIT compilation took 0.509422779083252s. -[triton-dejavu] First execution including JIT compilation took 1.1820228099822998s. -[triton-dejavu] First execution including JIT compilation took 0.7445404529571533s. -[triton-dejavu] First execution including JIT compilation took 0.4977116584777832s. -[triton-dejavu] First execution including JIT compilation took 1.091106653213501s. -[triton-dejavu] First execution including JIT compilation took 0.7330291271209717s. -[triton-dejavu] First execution including JIT compilation took 0.5066168308258057s. -[triton-dejavu] First execution including JIT compilation took 1.6019270420074463s. -[triton-dejavu] First execution including JIT compilation took 0.9132928848266602s. 
-[triton-dejavu] First execution including JIT compilation took 0.6490397453308105s. -[triton-dejavu] First execution including JIT compilation took 1.566523790359497s. -[triton-dejavu] First execution including JIT compilation took 0.9093658924102783s. -[triton-dejavu] First execution including JIT compilation took 0.5752038955688477s. -[triton-dejavu] First execution including JIT compilation took 1.3499906063079834s. -[triton-dejavu] First execution including JIT compilation took 0.7472167015075684s. -[triton-dejavu] First execution including JIT compilation took 0.5233032703399658s. -bench_cudagraph failed with out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.4235203266143799s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.367872953414917s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.3045461177825928s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4933462142944336s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5991702079772949s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.3020298480987549s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.43563389778137207s. 
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.3850095272064209s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.40093398094177246s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- - -[triton-dejavu] First execution including JIT compilation took 0.6140017509460449s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.48171281814575195s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5723874568939209s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5938704013824463s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7248561382293701s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5335805416107178s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.632122278213501s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5055139064788818s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.45116615295410156s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6221024990081787s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5203642845153809s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4454641342163086s. 
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5253396034240723s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.413818359375s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- - -[triton-dejavu] First execution including JIT compilation took 0.37888503074645996s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5810997486114502s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6574397087097168s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.39346885681152344s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.62447190284729s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.478407621383667s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4812755584716797s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6491189002990723s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5157642364501953s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.42938828468322754s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.680762529373169s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5025956630706787s. 
-bench_cudagraph failed with out of resource: shared memory, Required: 272384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 272384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 374784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
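These OutOfResources entries are a different failure than the CUDA OOM above: the candidate config compiled, but its shared-memory footprint (272384 or 374784 bytes here) exceeds the 232448-byte hardware limit, so Triton rejects the launch before it runs. A hedged sketch of how a tuning loop could skip such configs instead of dumping a traceback per attempt; the function name bench_config and the infinite-latency sentinel are assumptions, not code from this repository.

from triton.runtime.errors import OutOfResources

def bench_config(kernel_call):
    # Illustrative only: treat configs whose compiled kernel exceeds the
    # shared-memory limit ("Reducing block sizes or `num_stages` may help")
    # as unusable timings rather than letting the exception escape the loop.
    try:
        kernel_call()
    except OutOfResources as err:
        print(f"skipping config: {err}")
        return float("inf")
    return 0.0  # placeholder for the measured latency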
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.41063737869262695s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.30236029624938965s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5460262298583984s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.8257863521575928s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.562946081161499s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.42319226264953613s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7553699016571045s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5543286800384521s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4214789867401123s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7838122844696045s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5473670959472656s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.45372581481933594s. 
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7879917621612549s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4837973117828369s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- - -[triton-dejavu] First execution including JIT compilation took 0.39473915100097656s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.8744144439697266s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6230897903442383s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5225625038146973s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.4133057594299316s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.0481688976287842s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.756004810333252s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.3453631401062012s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6478581428527832s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.45032429695129395s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.1815299987792969s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6545298099517822s. 
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7831099033355713s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.3497538566589355s. -[triton-dejavu] First execution including JIT compilation took 0.6530613899230957s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.46605396270751953s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -bench_cudagraph failed with out of resource: shared memory, Required: 244736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 244736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 244736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 244736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 244736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 244736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.367077350616455s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7068085670471191s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.44535279273986816s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.4023311138153076s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7958266735076904s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.507124662399292s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 2.0259878635406494s. -[triton-dejavu] First execution including JIT compilation took 0.9261250495910645s. 
-bench_cudagraph failed with CUDA error: out of memory
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1
-Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
-
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph
-    with torch.cuda.graph(g):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__
-    self.cuda_graph.capture_end()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end
-    super().capture_end()
-RuntimeError: CUDA error: out of memory
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1
-Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
-
-
-[triton-dejavu] First execution including JIT compilation took 0.5468614101409912s.
-bench_cudagraph failed with out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 286720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 489472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with CUDA error: out of memory
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5238943099975586s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.343810796737671s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7683749198913574s. 
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5486259460449219s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.3422448635101318s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- - -[triton-dejavu] First execution including JIT compilation took 0.7824568748474121s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5656516551971436s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.4012060165405273s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.824357271194458s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6031546592712402s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.4797933101654053s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.9887309074401855s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6871368885040283s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.6989936828613281s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.9585423469543457s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.692669153213501s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.65909743309021s. 
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6878805160522461s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4942758083343506s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- - -[triton-dejavu] First execution including JIT compilation took 1.342179775238037s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7795774936676025s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4817812442779541s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.9305896759033203s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.8872191905975342s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4981191158294678s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 2.046060800552368s. -[triton-dejavu] First execution including JIT compilation took 0.9139325618743896s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5242531299591064s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 2.2648186683654785s. -[triton-dejavu] First execution including JIT compilation took 0.8834800720214844s. 
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7166852951049805s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -bench_cudagraph failed with out of resource: shared memory, Required: 259072, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 259072, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 259072, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 259072, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 259072, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 259072, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 359424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 359424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 359424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 359424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 359424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 359424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 2.2592248916625977s. -[triton-dejavu] First execution including JIT compilation took 1.0523405075073242s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
-
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph
-    with torch.cuda.graph(g):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__
-    self.cuda_graph.capture_end()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end
-    super().capture_end()
-RuntimeError: CUDA error: out of memory
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1
-Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
-
-
-[triton-dejavu] First execution including JIT compilation took 0.599764347076416s.
-bench_cudagraph failed with CUDA error: out of memory
-CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1
-Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
-
[... the same CUDA-graph capture out-of-memory traceback repeats for each remaining autotuner configuration, each preceded by its own "[triton-dejavu] First execution including JIT compilation took ...s." line ...]
-
-bench_cudagraph failed with out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 317440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 417792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 518144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 718848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
[... the corresponding OutOfResources tracebacks for these larger block-size configurations, and further CUDA-graph capture out-of-memory failures, repeat in the same pattern ...]
- - -[triton-dejavu] First execution including JIT compilation took 0.5721733570098877s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.6702356338500977s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.9042339324951172s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6040554046630859s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.504411220550537s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7958929538726807s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5112464427947998s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.6463310718536377s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.9492459297180176s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5592634677886963s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 2.21022367477417s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.9613430500030518s. 
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5633087158203125s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 2.2821779251098633s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- - -[triton-dejavu] First execution including JIT compilation took 1.097722053527832s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6317684650421143s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 3.0794928073883057s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.2995553016662598s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.8081183433532715s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 3.323143243789673s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.379629373550415s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7605845928192139s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -bench_cudagraph failed with out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 2.796154260635376s. -[triton-dejavu] First execution including JIT compilation took 1.4310317039489746s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7960169315338135s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 3.4028375148773193s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 1.6688313484191895s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.9481635093688965s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 8.42493486404419s. -[triton-dejavu] First execution including JIT compilation took 1.7116987705230713s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7699902057647705s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -bench_cudagraph failed with out of resource: shared memory, Required: 257024, Hardware limit: 232448. 
Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 257024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 257024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 257024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 257024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 257024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 339968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-[… several hundred removed benchmark-log lines, abridged here: repeated `triton.runtime.errors.OutOfResources` tracebacks raised from `triton_dejavu/autotuner.py` during `_do_bench_cudagraph` ("out of resource: shared memory, Required: 339968 / 348160 / 422912 / 514048 / 588800 / 679936 / 845824 / 1177600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help."), interleaved with repeated `RuntimeError: CUDA error: out of memory` failures raised from `torch.cuda.graph` capture (`capture_end()`) and `[triton-dejavu] First execution including JIT compilation took …s.` timing lines (roughly 0.30 s to 6.1 s) …]
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.31319618225097656s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5639190673828125s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.39958620071411133s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.3334677219390869s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6589338779449463s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4241213798522949s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.6113383769989014s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.8450291156768799s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5650513172149658s. 
-bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.4908101558685303s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.7597527503967285s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- - -[triton-dejavu] First execution including JIT compilation took 0.5363900661468506s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.48531675338745117s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.9195475578308105s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
-For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5560624599456787s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.5028431415557861s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.8041844367980957s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
- -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.49500370025634766s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - - -[triton-dejavu] First execution including JIT compilation took 0.37749719619750977s. -bench_cudagraph failed with CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. - -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 376, in _do_bench_cudagraph - with torch.cuda.graph(g): - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 186, in __exit__ - self.cuda_graph.capture_end() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/cuda/graphs.py", line 84, in capture_end - super().capture_end() -RuntimeError: CUDA error: out of memory -CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. -For debugging consider passing CUDA_LAUNCH_BLOCKING=1 -Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. 
-
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-[... this "Triton Error [CUDA]: out of memory" traceback from kernel loading and the CUDA-graph capture traceback above alternate for a few more configurations ...]
-
-[triton-dejavu] First execution including JIT compilation took 0.8381209373474121s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
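Note (not part of the captured log): the allocator message above suggests setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True. Below is a minimal, hedged sketch of how that hint could be applied before a tuning run; it assumes the tuning script controls process start-up, and the empty_cache() call is only an illustrative mitigation, not part of triton-dejavu.

    # Sketch: the allocator option must be in the environment before torch makes
    # its first CUDA allocation, so set it before importing/initializing torch.cuda.
    import os
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

    import torch

    # Release cached allocator blocks before the benchmark helper grabs its
    # ~256 MB L2-flush buffer (the torch.empty(int(256e6), ...) call in the traceback).
    torch.cuda.empty_cache()
    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")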
-
-[... the same torch.OutOfMemoryError report and traceback from the 256 MB benchmark-cache allocation (testing.py, line 351) repeat for every remaining configuration; only the "[triton-dejavu] First execution including JIT compilation took ...s." timing differs ...]
-
-[triton-dejavu] First execution including JIT compilation took 0.6872811317443848s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use.
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5252459049224854s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.6195275783538818s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9957168102264404s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5770971775054932s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.6193320751190186s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.776587724685669s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5602409839630127s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.470637559890747s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6102380752563477s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.44856834411621094s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.342865228652954s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7176856994628906s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4709174633026123s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 285696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 285696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 285696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 285696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 2.9359774589538574s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8123137950897217s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4981215000152588s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.2556896209716797s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0426855087280273s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5906248092651367s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.9886162281036377s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 246.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2194347381591797s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6244046688079834s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 260096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 344064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 411648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 411648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 563200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 563200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 571392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 571392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 571392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 571392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.48250651359558105s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.31485867500305176s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3440537452697754s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8460357189178467s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.36809873580932617s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3085494041442871s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5444772243499756s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.38303327560424805s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.34803223609924316s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5283372402191162s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3868238925933838s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.35518574714660645s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5908901691436768s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.41735363006591797s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6766963005065918s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5999925136566162s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.41122961044311523s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3416872024536133s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5752973556518555s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3956427574157715s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3643150329589844s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5298521518707275s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3585391044616699s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3086113929748535s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5575377941131592s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1772897243499756s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.521981954574585s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.40645861625671387s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.16839599609375s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5616059303283691s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.42472362518310547s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 244.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.35 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3248302936553955s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6032726764678955s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4481019973754883s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.395122766494751s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7378954887390137s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5323140621185303s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.7057254314422607s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6767878532409668s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5054512023925781s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.7137601375579834s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
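The failures above all come from the benchmark helper trying to allocate a ~256 MB L2-flush buffer while vLLM has already claimed almost the entire 80 GB device. A minimal sketch of how the allocator hint from the log could be applied, assuming the tuning run is launched from a small Python driver (the driver itself is hypothetical and not part of this patch series):

    import os

    # Must be set before the first CUDA allocation in the process, i.e. before
    # torch initializes its caching allocator (assumption: the driver controls
    # process startup).
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

    import torch

    def l2_flush_buffer() -> torch.Tensor:
        # Mirrors the allocation that fails in _do_bench_cudagraph: a ~256 MB
        # int8 buffer used to flush the L2 cache between timed kernel runs.
        return torch.empty(int(256e6), dtype=torch.int8, device="cuda")

Leaving more headroom on the device (for example by lowering vLLM's gpu_memory_utilization for the tuning run) would address the same symptom; the environment variable only reduces fragmentation, it cannot free memory that vLLM has already reserved.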
-bench_cudagraph failed with out of resource: shared memory, Required: 234496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 234496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[... the same OutOfResources traceback repeats for configurations requiring 284672, 301056, 310272, 318464, 368640, and 385024 bytes of shared memory, interleaved with further CUDA out-of-memory failures identical to those above (JIT compilation times roughly 0.5 s to 3.4 s) ...]
-bench_cudagraph failed with out of resource: shared memory, Required: 385024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 385024, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 452608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 452608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 468992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 468992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 468992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 468992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 620544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 620544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 636928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 636928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 636928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 636928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.7688605785369873s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3665287494659424s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3449244499206543s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6777553558349609s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4027137756347656s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.38666725158691406s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7940988540649414s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4141719341278076s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.37494373321533203s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7313904762268066s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.42134833335876465s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3692958354949951s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7075221538543701s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.43701624870300293s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.42354393005371094s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8378398418426514s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4932551383972168s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4718587398529053s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8113245964050293s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4463827610015869s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Traceback (most recent call last):
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

[triton-dejavu] First execution including JIT compilation took 0.428286075592041s.
bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 242.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use.
[... the same OutOfMemoryError traceback and "bench_cudagraph failed" report repeat for every subsequent autotuning configuration; only the reported JIT compilation time (roughly 0.07 s to 1.43 s) and the free-memory figure (240.94 MiB vs. 242.94 MiB) vary ...]
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5467684268951416s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3634920120239258s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5880486965179443s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5700411796569824s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.4810731410980225s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6438288688659668s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4656994342803955s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.6747398376464844s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7002549171447754s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5245516300201416s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 240.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.000950813293457s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9302749633789062s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.604921817779541s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.449615478515625s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
-bench_cudagraph failed with out of resource: shared memory, Required: 233472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 233472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
[... the same OutOfResources traceback repeats for other configurations with Required: 266240, 267264, 283648, 333824, 366592, 367616, and 384000 bytes against the 232448-byte hardware limit, interleaved with further identical CUDA out-of-memory failures (238.94 MiB free) ...]
-bench_cudagraph failed with out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 366592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 434176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 434176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 466944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 466944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 466944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 466944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 534528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 534528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 567296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 567296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 567296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 567296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 735232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 735232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 768000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 768000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 768000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 768000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.9831523895263672s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5336413383483887s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.46745753288269043s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0242087841033936s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6498258113861084s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6161227226257324s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0387804508209229s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5843358039855957s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.489365816116333s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.084639549255371s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6798868179321289s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6222915649414062s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.24820876121521s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6479372978210449s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5105531215667725s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.145951509475708s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6538543701171875s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5761466026306152s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.4845623970031738s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6651618480682373s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.55460524559021s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.081782341003418s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5795166492462158s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4992384910583496s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.14192533493042s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6035377979278564s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7185189723968506s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2579710483551025s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6511590480804443s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.507249116897583s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 238.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2077677249908447s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.003390789031982422s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5164101123809814s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2660481929779053s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6785335540771484s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5961654186248779s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.4194557666778564s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7268083095550537s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5461311340332031s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3952383995056152s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7429358959197998s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9075326919555664s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3474931716918945s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6649174690246582s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5712227821350098s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.4214322566986084s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9512057304382324s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6910531520843506s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.6161599159240723s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7413544654846191s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7766103744506836s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Traceback (most recent call last):
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

[triton-dejavu] First execution including JIT compilation took 1.6750471591949463s.
bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 236.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

[... the same torch.OutOfMemoryError and traceback repeat for the following candidate configurations; only the reported JIT compilation time (roughly 0.003 s to 1.9 s) differs ...]
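The allocator hint repeated throughout these failures can be acted on by setting PYTORCH_CUDA_ALLOC_CONF before the process initializes CUDA. A minimal sketch, assuming a hypothetical standalone benchmark script; the environment variable and the 256 MB scratch allocation are the ones named in the log above:

# Minimal sketch (hypothetical script): enable expandable segments for
# PyTorch's CUDA caching allocator, as suggested by the OOM message above.
# The variable must be set before the first CUDA allocation is made.
import os
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch  # imported after the env var so the allocator picks the setting up

# The same 256 MB scratch buffer that _do_bench_cudagraph tries to allocate in the log.
cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")

Whether this helps here depends on how fragmented the ~78 GiB already held by the engine is; it does not reduce the total footprint, it only lets the allocator grow existing segments instead of requesting new contiguous blocks.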
[... further candidates fail with the identical CUDA out-of-memory error shown above ...]

bench_cudagraph failed with out of resource: shared memory, Required: 241152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
Traceback (most recent call last):
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
    fn()
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
    return jit_first_time()
           ^^^^^^^^^^^^^^^^
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
    ret = self.call_lambda()
          ^^^^^^^^^^^^^^^^^^
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
    self.fn.run(
  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
    ^^^^^^^^^^
  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
    self._init_handles()
  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 241152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.

[... the same OutOfResources error and traceback repeat for larger tile configurations, with Required shared memory of 248832, 257536, 282624, 315392, 349184, 381952, 482304 and 515072 bytes against the 232448-byte hardware limit, interleaved with further CUDA out-of-memory failures (GPU 0 now reporting 234.94 MiB free) ...]
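The OutOfResources failures come from autotuner candidates whose tiles need more shared memory than the 232448-byte per-SM limit of this GPU. A hedged sketch of how a plain Triton autotune search space can be kept under such a limit by preferring smaller block sizes and fewer pipeline stages; the kernel, its name, and the concrete config values are illustrative assumptions, not the triton-dejavu autotuner API:

import triton
import triton.language as tl

# Illustrative search space: larger BLOCK_M/BLOCK_N/BLOCK_K tiles and more
# pipeline stages increase the per-CTA shared-memory footprint, which is what
# pushes the failing candidates above the ~232 KB limit reported in the log.
_configs = [
    triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32}, num_warps=4, num_stages=2),
    triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32}, num_warps=8, num_stages=3),
    triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32}, num_warps=8, num_stages=3),
]

@triton.autotune(configs=_configs, key=["M", "N", "K"])
@triton.jit
def _matmul_kernel(a_ptr, b_ptr, c_ptr, M, N, K,
                   stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn,
                   BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr):
    # Standard tiled matmul body, kept only so the configs above attach to
    # something concrete; each (BLOCK_M x BLOCK_K) and (BLOCK_K x BLOCK_N)
    # tile is staged through shared memory across num_stages pipeline stages.
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)
    a_ptrs = a_ptr + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak
    b_ptrs = b_ptr + offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for k in range(0, K, BLOCK_K):
        a = tl.load(a_ptrs, mask=(offs_m[:, None] < M) & (offs_k[None, :] < K - k), other=0.0)
        b = tl.load(b_ptrs, mask=(offs_k[:, None] < K - k) & (offs_n[None, :] < N), other=0.0)
        acc += tl.dot(a, b)
        a_ptrs += BLOCK_K * stride_ak
        b_ptrs += BLOCK_K * stride_bk
    c_ptrs = c_ptr + offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn
    tl.store(c_ptrs, acc, mask=(offs_m[:, None] < M) & (offs_n[None, :] < N))

Candidates that exceed the limit are not fatal during a search (Triton raises OutOfResources and the tuner records the config as failed, as above), but trimming them from the space avoids wasting compile time on configurations that can never run on this hardware.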
[... additional candidates fail with the identical CUDA out-of-memory error, and with OutOfResources errors requiring 299008, 364544 and 432128 bytes of shared memory against the 232448-byte hardware limit ...]

bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 565248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 565248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 630784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 630784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 630784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 630784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 698368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 698368, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 763904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 763904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 763904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 763904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 964608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 964608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1030144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1030144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1030144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1030144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 2.577185869216919s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2491505146026611s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 234.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.36 GiB memory in use. 
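The messages above already name the two practical mitigations: the PyTorch allocator hint (PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True) and skipping configurations whose shared-memory footprint can never fit the device. A minimal sketch of both follows, assuming the tuning run is driven from a Python script; `run_config` and the surrounding names are illustrative and not part of triton-dejavu.

```python
# Sketch only: mitigations suggested by the log messages above; names are illustrative.
import os

# 1) Ask the CUDA caching allocator for expandable segments before torch touches CUDA,
#    as recommended by the out-of-memory message.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch
from triton.runtime.errors import OutOfResources  # exception type seen in the tracebacks


def bench_config_safely(run_config):
    """Benchmark one candidate config; skip it if it exceeds shared or device memory."""
    try:
        return run_config()  # hypothetical callable that launches the kernel for this config
    except OutOfResources as exc:
        # e.g. "Required: 299008, Hardware limit: 232448" -> this config can never fit
        print(f"pruned config (shared memory): {exc}")
    except torch.OutOfMemoryError:
        # release cached blocks so the next candidate gets a cleaner allocator state
        torch.cuda.empty_cache()
        print("pruned config (CUDA OOM)")
    return None
```

Pruning instead of crashing keeps the sweep going, which matches what the autotuner appears to do here: it records the failure and moves on to the next configuration.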
[the CUDA out-of-memory failure then repeats for every remaining configuration; only the numbers drift]

Across these attempts the free memory reported for GPU 0 falls from 234.94 MiB to 232.94 MiB and finally 230.94 MiB, the process footprint grows from 78.36 GiB to 78.37 GiB, and first executions including JIT compilation take between roughly 0.76 s and 3.06 s. Every attempt fails at the same place, the int(256e6)-byte benchmark-cache allocation at triton_dejavu/testing.py line 351, and every message repeats the PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True advice. The last attempt in this stretch reports: bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use.
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8184218406677246s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.0979418754577637s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.5288279056549072s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8494882583618164s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.252285957336426s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.004141569137573242s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8563632965087891s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.2991631031036377s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.5022201538085938s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8538022041320801s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 230.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.498495578765869s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.5448570251464844s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8523283004760742s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.52825927734375s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.6031606197357178s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0104546546936035s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.137936592102051s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.556880235671997s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.949892520904541s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.461966037750244s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.5957205295562744s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9467792510986328s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 4.070852518081665s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.6498074531555176s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9917776584625244s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 228.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 4.3239054679870605s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 226.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 226.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[triton-dejavu] First execution including JIT compilation took 1.7489638328552246s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 226.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.37 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-bench_cudagraph failed with out of resource: shared memory, Required: 239616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 239616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[... the same two failure signatures repeat for the remaining autotuner configurations: "bench_cudagraph failed with CUDA out of memory" while allocating the 246.00 MiB benchmark cache, and "bench_cudagraph failed with out of resource: shared memory" with Required values between 239616 and 777216 bytes against the 232448-byte hardware limit ...]
-
-bench_cudagraph failed with out of resource: shared memory, Required: 759808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 759808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 958464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 958464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 958464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 958464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1026048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1026048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1157120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1157120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1157120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1157120, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1423360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1423360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1554432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1554432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1554432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1554432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 0.6892292499542236s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
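The escalating `Required:` values above are autotuner candidates whose block sizes and `num_stages` demand more shared memory than the roughly 232 KB per-block limit of this GPU; those configurations fail at launch and are skipped by the tuner. Below is a minimal, hypothetical sketch of how a Triton tuning space can be kept small enough for that budget. The kernel and parameter names are illustrative only, not the ones used by this repository's kernels.

```python
import torch
import triton
import triton.language as tl


# Illustrative tuning space. For tiled kernels (attention, GEMM), shared-memory
# use grows with the block sizes and with num_stages, so keeping both modest
# avoids the OutOfResources failures seen in the log. This toy copy kernel only
# demonstrates how such a space is declared.
@triton.autotune(
    configs=[
        triton.Config({"BLOCK_SIZE": 128}, num_stages=2, num_warps=4),
        triton.Config({"BLOCK_SIZE": 256}, num_stages=2, num_warps=4),
        triton.Config({"BLOCK_SIZE": 512}, num_stages=3, num_warps=8),
    ],
    key=["n_elements"],
)
@triton.jit
def _copy_kernel(src_ptr, dst_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(src_ptr + offsets, mask=mask)
    tl.store(dst_ptr + offsets, x, mask=mask)


if __name__ == "__main__":
    n = 1 << 20
    src = torch.randn(n, device="cuda")
    dst = torch.empty_like(src)
    grid = lambda meta: (triton.cdiv(n, meta["BLOCK_SIZE"]),)
    _copy_kernel[grid](src, dst, n)
```

An alternative to shrinking the space is to let the tuner enumerate large configurations and simply tolerate the launch failures, which is what the log above shows happening.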
-
-[triton-dejavu] First execution including JIT compilation took 0.6892292499542236s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 76.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[triton-dejavu] First execution including JIT compilation took 0.38911986351013184s.
-[triton-dejavu] First execution including JIT compilation took 0.30687904357910156s.
-[triton-dejavu] First execution including JIT compilation took 0.6879351139068604s.
-[triton-dejavu] First execution including JIT compilation took 0.42769932746887207s.
-[triton-dejavu] First execution including JIT compilation took 0.329437255859375s.
-[triton-dejavu] First execution including JIT compilation took 0.7260704040527344s.
-[triton-dejavu] First execution including JIT compilation took 0.4387474060058594s.
-[triton-dejavu] First execution including JIT compilation took 0.39746975898742676s.
-[triton-dejavu] First execution including JIT compilation took 0.7281160354614258s.
-[triton-dejavu] First execution including JIT compilation took 0.5435876846313477s.
-[triton-dejavu] First execution including JIT compilation took 0.37840747833251953s.
-[triton-dejavu] First execution including JIT compilation took 0.8864538669586182s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
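The repeated `torch.OutOfMemoryError` comes from the benchmark helper trying to allocate a ~256 MB L2-flush scratch buffer on a GPU that is already almost fully reserved, so the allocation fails even though only 246 MiB is requested. The allocator hint printed in the message can be applied by setting the environment variable before CUDA is initialized. A minimal sketch, assuming the tuning run is started from a Python entry point:

```python
import os

# Must be set before the first CUDA allocation, i.e. before torch initializes
# its caching allocator; afterwards the setting has no effect.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch

# The helper in the log pre-allocates an int8 scratch buffer of ~256 MB to
# flush the L2 cache between timed runs; expandable segments reduce the chance
# that fragmentation makes this allocation fail on a nearly full device.
cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
print(torch.cuda.memory_allocated() / 2**20, "MiB allocated")
```

Exporting `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True` in the shell before launching the script has the same effect; lowering vLLM's `gpu_memory_utilization` is the other obvious lever when tuning must run alongside a loaded model.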
-[triton-dejavu] First execution including JIT compilation took 0.5189304351806641s.
-[triton-dejavu] First execution including JIT compilation took 0.3825032711029053s.
-[triton-dejavu] First execution including JIT compilation took 0.960721492767334s.
-[triton-dejavu] First execution including JIT compilation took 0.47132205963134766s.
-[triton-dejavu] First execution including JIT compilation took 0.39466094970703125s.
-[triton-dejavu] First execution including JIT compilation took 0.9138970375061035s.
-[triton-dejavu] First execution including JIT compilation took 0.5223879814147949s.
-[triton-dejavu] First execution including JIT compilation took 0.3736448287963867s.
-[triton-dejavu] First execution including JIT compilation took 0.7308955192565918s.
-[triton-dejavu] First execution including JIT compilation took 0.44620800018310547s.
-[triton-dejavu] First execution including JIT compilation took 0.33580613136291504s.
-[triton-dejavu] First execution including JIT compilation took 0.8977346420288086s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated.
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7137322425842285s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.34291768074035645s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8276610374450684s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5064258575439453s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3502078056335449s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.067542552947998s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5365092754364014s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3644242286682129s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8512518405914307s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5115513801574707s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3718876838684082s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.920119047164917s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5264711380004883s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4116835594177246s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9466478824615479s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.566298246383667s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.43740200996398926s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1071062088012695s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.526606559753418s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.45633745193481445s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.177720069885254s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.768721342086792s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4274454116821289s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.4301061630249023s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6415538787841797s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.42043113708496094s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[triton-dejavu] First execution including JIT compilation took 1.3885080814361572s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 74.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
[The identical OutOfMemoryError traceback from triton_dejavu/testing.py line 351 and the "bench_cudagraph failed with CUDA out of memory" message repeat verbatim for the following tuning configurations, with JIT compilation times of 0.66 s, 0.45 s, 1.37 s, 0.77 s, 0.46 s, 1.53 s, 0.70 s, and 0.52 s; partway through, the reported free memory drops from 74.94 MiB to 72.94 MiB.]
-
-bench_cudagraph failed with out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 265728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 265728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 265728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 265728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 2.7303383350372314s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1459696292877197s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.44887781143188477s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.7154202461242676s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9417815208435059s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5006546974182129s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.349583625793457s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0592787265777588s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5471467971801758s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 244736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 244736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 315392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 315392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 319488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 319488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 319488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 319488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 386048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 386048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 390144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 390144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 390144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 390144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 5.510880470275879s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.307586193084717s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8460187911987305s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 4.805698871612549s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.5006825923919678s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8813536167144775s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 72.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 348160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 356352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 489472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 630784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 638976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 772096, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 780288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1054720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1062912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-[each of these failures was followed by the same Triton OutOfResources traceback as above, differing only in the Required value]
-[triton-dejavu] First execution including JIT compilation took 0.8435537815093994s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. [remainder of the message is identical to the OutOfMemoryError above]
-
-[the same CUDA out-of-memory failure and traceback repeat for every remaining configuration, with first-execution JIT compilation times between roughly 0.35s and 1.05s]
-[triton-dejavu] First execution including JIT compilation took 0.5333900451660156s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use.
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.38635730743408203s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1536855697631836s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5275108814239502s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.41078877449035645s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0259864330291748s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.0027832984924316406s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.42501282691955566s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0600135326385498s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6524257659912109s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.40082621574401855s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0696709156036377s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6272509098052979s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4128298759460449s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1351423263549805s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.889866828918457s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4912388324737549s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3318710327148438s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5907411575317383s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.41211819648742676s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3889124393463135s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8929169178009033s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.46093177795410156s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.5692577362060547s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7735788822174072s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.47498464584350586s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.6411559581756592s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[triton-dejavu] First execution including JIT compilation took 0.7738308906555176s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 70.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.52 GiB memory in use.
-[... the identical CUDA out-of-memory traceback repeats for several subsequent autotuner configurations; only the JIT compilation time (0.50 s to 1.80 s) and the reported free memory (70.94 MiB, later 68.94 MiB) change ...]
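The allocator hint repeated in the log above (PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True) only takes effect if it is set before CUDA is initialized. Below is a minimal sketch, assuming a fresh standalone Python process rather than the patched benchmark harness itself, of applying the setting and retrying the 256 MB scratch allocation that _do_bench_cudagraph fails on:

import os

# Assumption: this runs in a fresh process, before torch has initialized CUDA;
# otherwise the allocator option is ignored.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch

if torch.cuda.is_available():
    # The same 256 MB int8 scratch buffer that _do_bench_cudagraph allocates.
    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
    print(torch.cuda.memory_summary(abbreviated=True))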
-bench_cudagraph failed with out of resource: shared memory, Required: 278016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 278016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[... the same shared-memory OutOfResources traceback repeats for the larger configurations in this hunk; the required shared memory observed is 257024, 265216, 278016, 282112, 331776, 339968, 364544, 380928, 406528, 414720, 514048, 530432, 556032 and 564224 bytes, always against the 232448-byte hardware limit; interleaved with these are further CUDA out-of-memory failures of the kind shown above, with JIT compilation times up to 6.89 s and the reported free memory dropping to 66.94 MiB; the final failure (Required: 530432 bytes) follows below ...]
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 514048, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 663552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 663552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 679936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 813056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 813056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 829440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 829440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 829440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 829440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1112064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1112064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1128448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1128448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1128448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1128448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.0904018878936768s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5664336681365967s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3864610195159912s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.4100468158721924s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.591252326965332s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.3967752456665039s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1207458972930908s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6113357543945312s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4384629726409912s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3609027862548828s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6447341442108154s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4421412944793701s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.172593355178833s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6391100883483887s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.43094921112060547s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3330214023590088s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6399593353271484s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4591398239135742s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2691125869750977s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 66.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
[... the identical OutOfMemoryError traceback and "bench_cudagraph failed" messages repeat for every remaining autotuning configuration; only the reported JIT compilation times (roughly 0.003 s to 2.2 s) vary, and the free GPU memory drops from 66.94 MiB to 64.94 MiB ...]
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated.
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5568861961364746s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 306688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 306688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 314880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 314880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 314880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 314880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 3.5550520420074463s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1517488956451416s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5613963603973389s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 64.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 4.608468770980835s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3026208877563477s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6365609169006348s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 4.679625034332275s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.3380742073059082s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7074956893920898s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 364544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 364544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 380928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 380928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 380928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 380928, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 447488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 447488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 463872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 463872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 463872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 463872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 613376, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 613376, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 629760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 629760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 629760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 629760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 8.24333930015564s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.6616246700286865s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9932739734649658s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 62.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 6.838382720947266s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
[... the identical OutOfResources traceback recurs for each larger configuration; bench_cudagraph reports Required: 264192, 397312, 430080, 563200, 595968, 729088, 761856, 894976, 927744, 1226752, and 1259520 bytes, each against the 232448-byte hardware limit ...]
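
The CUDA out-of-memory failures that follow come from the benchmarking helper itself: _do_bench_cudagraph (testing.py, line 351) tries to allocate a ~256 MB int8 scratch buffer while only ~60 MiB of the card's 79.18 GiB remain free, so the allocation fails regardless of the kernel under test. The error text recommends expandable segments; a minimal sketch of applying that hint when launching the tuning run, with a hypothetical smaller-buffer fallback that is not part of triton-dejavu:

# Sketch: the allocator option must be in place before torch makes its first
# CUDA allocation in the process, i.e. before the caching allocator initializes.
import os
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch

def scratch_buffer(nbytes=int(256e6)):
    # Same allocation that fails in _do_bench_cudagraph (testing.py:351);
    # retrying with a smaller buffer is an assumed workaround, not upstream behavior.
    try:
        return torch.empty(nbytes, dtype=torch.int8, device="cuda")
    except torch.OutOfMemoryError:
        return torch.empty(nbytes // 8, dtype=torch.int8, device="cuda")
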
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1259520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 2.333728551864624s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9741692543029785s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5558607578277588s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.7010092735290527s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0556650161743164s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5470688343048096s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.450766086578369s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0898175239562988s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8188917636871338s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.6552906036376953s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0663738250732422s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6016709804534912s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.5987627506256104s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0263035297393799s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6767642498016357s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.7210144996643066s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0204486846923828s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.614182710647583s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 60.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.53 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.7602884769439697s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0571949481964111s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 58.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7755577564239502s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 264704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 281088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 281088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 281088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 281088, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 364032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 364032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 380416, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 380416, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 380416, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 380416, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 6.028242111206055s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.8080382347106934s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8130929470062256s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 54.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 6.314016580581665s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.8047997951507568s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8820269107818604s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 7.452376842498779s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 363520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 363520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 363520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 363520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 430080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 430080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 462848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 462848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 462848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 462848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 529408, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 529408, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 562176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 562176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 562176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 562176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 728064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 728064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 760832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 760832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 760832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 760832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 11.448482990264893s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 4.280648231506348s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.4616827964782715s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 52.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 462848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 462848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 528384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 528384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 528384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 528384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 727040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 727040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 727040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 727040, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 860160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 860160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 925696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 925696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 925696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 925696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1058816, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1058816, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1124352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1124352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1124352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1124352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1456128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1456128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1521664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1521664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1521664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1521664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 5.428146839141846s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.640364408493042s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1616089344024658s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 5.564483642578125s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.6187920570373535s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2367215156555176s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 5.862403154373169s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.5825343132019043s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.245880126953125s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 5.880247354507446s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.4725282192230225s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2873585224151611s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 50.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 5.849554061889648s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.6708860397338867s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.281620740890503s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 6.745583772659302s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.5308899879455566s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.291445016860962s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 6.3013856410980225s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 48.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.54 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
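The allocator hint repeated in these traces is actionable before the tuning run starts: PYTORCH_CUDA_ALLOC_CONF has to be in the environment before the first CUDA allocation. A minimal sketch, assuming the benchmark is driven from a Python entry point (the environment variable is standard PyTorch; the surrounding code is illustrative only):

    import os
    # Must be set before torch initializes its CUDA caching allocator,
    # so do it before the torch import / first CUDA call.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    import torch
    # ... then launch the triton-dejavu tuning/benchmark run as usual.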
-bench_cudagraph failed with out of resource: shared memory, Required: 239360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
- fn()
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
- return jit_first_time()
- ^^^^^^^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
- ret = self.call_lambda()
- ^^^^^^^^^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
- self.fn.run(
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
- kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
- ^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
- self._init_handles()
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
- raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 239360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 255744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 280576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 313344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 346624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 379392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 379392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 379392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 379392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 478720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 478720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 511488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 511488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 511488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 511488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 12.265568017959595s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 40.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 40.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 4.9171974658966064s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 40.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 40.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.9982192516326904s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 40.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 40.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 12.480285167694092s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 38.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 38.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 4.878373861312866s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 38.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 38.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.379201889038086s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 38.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 38.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.55 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 429056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 429056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 561152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 561152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 626688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 626688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 626688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 626688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 693248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 693248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 758784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 758784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 758784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 758784, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 957440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 957440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1022976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1022976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1022976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1022976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ - self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) -RuntimeError: Triton Error [CUDA]: out of memory - -[triton-dejavu] First execution including JIT compilation took 11.809521436691284s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 4.696657657623291s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-[... the identical OutOfResources traceback repeats for each larger tuning configuration; the required shared memory grows from 460800 to 2045952 bytes, always against the 232448-byte hardware limit ...]
-
-[triton-dejavu] First execution including JIT compilation took 1.3908696174621582s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[triton-dejavu] First execution including JIT compilation took 0.6094310283660889s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use.
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-[... the identical torch.OutOfMemoryError traceback (failing to allocate the 256 MB benchmark cache in _do_bench_cudagraph) repeats for every remaining configuration; the reported first-execution JIT compilation times range from roughly 0.38 s to 1.80 s ...]
-
-[triton-dejavu] First execution including JIT compilation took 0.7657811641693115s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use.
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.482452392578125s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.7750086784362793s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.993119478225708s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.46810221672058105s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.8732283115386963s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8175673484802246s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5403792858123779s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.8839638233184814s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9344329833984375s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4993572235107422s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.9738190174102783s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.921602725982666s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5270700454711914s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 36.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 254720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 254720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 255744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 255744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 255744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 255744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 2.9389686584472656s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9776608943939209s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.721153974533081s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.328566074371338s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1687307357788086s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7047884464263916s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 4.064958572387695s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2405052185058594s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7056465148925781s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 237056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 237056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[... the same OutOfResources traceback repeats for further autotuner configurations, with Required values ranging from 239104 to 1022976 bytes against the 232448-byte hardware limit ...]
-
-[triton-dejavu] First execution including JIT compilation took 5.935272693634033s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 34.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[... the same CUDA out-of-memory traceback and further OutOfResources tracebacks repeat for the remaining autotuner configurations ...]
-
-bench_cudagraph failed with out of resource: shared memory, Required: 956416, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 956416, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1220608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1220608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1228800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1228800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1228800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1228800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1492992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1492992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1501184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1501184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1501184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1501184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2037760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2037760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2045952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2045952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2045952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2045952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 1.6654775142669678s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6987285614013672s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.46905040740966797s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.6758484840393066s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8101885318756104s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7105352878570557s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.8995463848114014s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8054959774017334s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.47978901863098145s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.7944765090942383s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8776211738586426s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5182280540466309s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.7438545227050781s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.814023494720459s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.5291764736175537s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 32.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.2716925144195557s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 30.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 30.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.8169729709625244s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 30.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Traceback (most recent call last):
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 30.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

[triton-dejavu] First execution including JIT compilation took 0.502709150314331s.
bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 30.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated.
[... the same CUDA out-of-memory traceback and "bench_cudagraph failed" message repeat for each further candidate configuration; only the reported JIT compilation time varies, between roughly 0.5 s and 3.1 s ...]
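The allocator hint printed in the messages above can be applied before the tuning run initializes CUDA. A minimal sketch, assuming a standalone driver script (the script itself is illustrative and not part of this repository); the environment variable must be set before the first CUDA allocation:

    import os

    # Must be set before torch/vLLM touch the GPU for the first time.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    import torch  # noqa: E402  (imported after setting the allocator config)

    # The ~246 MiB allocation that fails in the log is a 256e6-byte int8 buffer.
    # With expandable segments the caching allocator can grow existing segments
    # instead of requiring a fresh contiguous block, which helps when the card
    # is almost full and the free memory is fragmented.
    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")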
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

bench_cudagraph failed with out of resource: shared memory, Required: 261888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
Traceback (most recent call last):
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
    fn()
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
    return jit_first_time()
           ^^^^^^^^^^^^^^^^
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
    ret = self.call_lambda()
          ^^^^^^^^^^^^^^^^^^
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
    self.fn.run(
  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
    ^^^^^^^^^^
  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
    self._init_handles()
  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 261888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.

[... the same OutOfResources traceback repeats for configurations requiring 263936 bytes of shared memory, followed by another CUDA out-of-memory failure (JIT compilation took 3.2446868419647217s) ...]
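The OutOfResources failures above mean a candidate tile configuration needs more shared memory than the device exposes per block (232448 bytes here). A minimal sketch of how such configurations can be kept out of an autotune search space; the kernel, parameter names (BLOCK_M, BLOCK_N) and configs are placeholders for illustration, not the kernel being tuned in this log:

    import triton
    import triton.language as tl

    # Shared-memory use grows roughly with the tile sizes and num_stages, so
    # dropping the largest tiles / stage counts keeps candidates under the
    # 232448-byte hardware limit reported above.
    configs = [
        triton.Config({"BLOCK_M": 128, "BLOCK_N": 128}, num_stages=2, num_warps=8),
        triton.Config({"BLOCK_M": 64, "BLOCK_N": 128}, num_stages=3, num_warps=4),
        triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_stages=4, num_warps=4),
    ]

    @triton.autotune(configs=configs, key=["M"])
    @triton.jit
    def _toy_copy_kernel(x_ptr, y_ptr, M, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
        # Trivial copy kernel, only here to give the autotuner something to run.
        pid = tl.program_id(0)
        offs = pid * BLOCK_M + tl.arange(0, BLOCK_M)
        x = tl.load(x_ptr + offs, mask=offs < M)
        tl.store(y_ptr + offs, x, mask=offs < M)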
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

[triton-dejavu] First execution including JIT compilation took 1.1245031356811523s.
bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated.
[... the same CUDA out-of-memory traceback and message repeat for each further configuration, with free memory now at 28.94 MiB and JIT compilation times between roughly 0.6 s and 4.6 s ...]
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

bench_cudagraph failed with out of resource: shared memory, Required: 243200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
[... the same OutOfResources traceback as above repeats for configurations requiring 247296, 313344, 317440, 383488, and 387584 bytes of shared memory against the 232448-byte hardware limit ...]
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 387584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 387584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 387584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 523776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 523776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 527872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 527872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 527872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 6.749169826507568s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.6596052646636963s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9074513912200928s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 28.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.56 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 8.860004186630249s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.4395840167999268s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0885822772979736s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 346112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 346112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 354304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 486400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 486400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 494592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 626688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 626688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 634880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 634880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 634880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 634880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 766976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 766976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 775168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 775168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 775168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 775168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1047552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1047552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1055744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1055744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1055744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1055744, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 24.700168132781982s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 6.437432765960693s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.4199228286743164s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 411648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 411648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 692224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 692224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 708608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 708608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 708608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 708608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 972800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 972800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 989184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 989184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 989184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 989184, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1253376, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1253376, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1269760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1269760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1269760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1269760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1533952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1533952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1550336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1550336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1550336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1550336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2095104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2095104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2111488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2111488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2111488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
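The two failure modes repeated in this log each come with the remedy the message itself suggests: the allocator hint PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True for the CUDA OOM hit while allocating the cudagraph benchmarking cache, and smaller block sizes / fewer pipeline stages for the shared-memory overflows (e.g. 411648 B requested vs. the 232448 B hardware limit). The sketch below is only an illustration of how one might apply both before re-running a tuning sweep; it uses the stock triton.autotune decorator as a stand-in for the triton-dejavu autotuner seen in the tracebacks, and the kernel, config values, and parameter names are illustrative assumptions, not the configurations used in this log.

import os

# Allocator hint from the OOM messages above; it is read when the CUDA
# caching allocator initializes, so it must be set before the first CUDA
# allocation (or exported in the shell before launching the process).
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import triton
import triton.language as tl

# Hypothetical, deliberately small search space: capping BLOCK and
# num_stages keeps every candidate's shared-memory footprint under the
# per-SM limit reported in the OutOfResources errors.
_SMALL_CONFIGS = [
    triton.Config({"BLOCK": 64}, num_stages=2, num_warps=4),
    triton.Config({"BLOCK": 128}, num_stages=3, num_warps=8),
]

@triton.autotune(configs=_SMALL_CONFIGS, key=["n_elements"])
@triton.jit
def _copy_kernel(x_ptr, y_ptr, n_elements, BLOCK: tl.constexpr):
    # Toy stand-in for the attention kernel being tuned in this log.
    pid = tl.program_id(0)
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n_elements
    tl.store(y_ptr + offs, tl.load(x_ptr + offs, mask=mask), mask=mask)
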
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2111488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 3.121896266937256s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0428099632263184s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6234233379364014s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 26.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.9806389808654785s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2036631107330322s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6355829238891602s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.0641424655914307s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1515545845031738s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.646599292755127s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.120051383972168s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.219144582748413s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.6336996555328369s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.227719306945801s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2983787059783936s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.648597002029419s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.2070751190185547s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1535370349884033s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.7411158084869385s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.3133440017700195s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2201085090637207s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 24.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[triton-dejavu] First execution including JIT compilation took 0.8044769763946533s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 20.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-bench_cudagraph failed with out of resource: shared memory, Required: 276224, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 403968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 412160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 412160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 412160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 412160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 552448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 552448, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 560640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-[triton-dejavu] First execution including JIT compilation took 8.529049634933472s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 18.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.57 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-bench_cudagraph failed with out of resource: shared memory, Required: 362496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1054720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1318912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1318912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1351680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1351680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1351680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1351680, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1615872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1615872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1648640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1648640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1648640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1648640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2209792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2209792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2242560, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2242560, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2242560, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2242560, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -[triton-dejavu] First execution including JIT compilation took 6.6451661586761475s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 16.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 16.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.0081593990325928s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 16.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 16.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9920327663421631s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 16.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 16.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 7.377732992172241s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.00347900390625s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.1749897003173828s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 7.304524898529053s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.9663889408111572s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.9999239444732666s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 7.501835346221924s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.0289182662963867s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.2674453258514404s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 7.546030521392822s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 14.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.1539206504821777s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.0315632820129395s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 7.50945520401001s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 2.068497657775879s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. 
(Autotuning log excerpt: the following two failure modes repeat across dozens of candidate configurations; only the JIT compilation time, roughly 1.1 s to 10.6 s, and the remaining free GPU memory, 12.94 MiB shrinking to 4.94 MiB, vary between occurrences.)

[triton-dejavu] First execution including JIT compilation took 1.1107735633850098s.
bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Traceback (most recent call last):
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 12.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.58 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

bench_cudagraph failed with out of resource: shared memory, Required: 304896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
Traceback (most recent call last):
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
    fn()
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
    return jit_first_time()
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
    ret = self.call_lambda()
  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
    self.fn.run(
  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
    self._init_handles()
  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 304896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.

(Subsequent configurations failed the same way, requiring 280064, 296448, 313088, 362496, 378880, 444928, and 461312 bytes of shared memory against the same 232448-byte hardware limit.)
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 461312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 461312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 461312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 609792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 609792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 626176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 626176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 626176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 626176, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ - self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) -RuntimeError: Triton Error [CUDA]: out of memory - -[triton-dejavu] First execution including JIT compilation took 4.474817276000977s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 4.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 4.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 1.7217485904693604s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 4.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 4.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 529, in __call__ - self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args) -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 560128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 560128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 592896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 592896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 592896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 592896, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 724992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 724992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 757760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 757760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 757760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 757760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 889856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 889856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 922624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 922624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 922624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 922624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1219584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1219584, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1252352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1252352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1252352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1252352, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -[triton-dejavu] First execution including JIT compilation took 9.443265199661255s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 3.8229305744171143s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. 
Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 526336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 526336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[... identical OutOfResources tracebacks repeated for the remaining configurations, with "bench_cudagraph failed with out of resource: shared memory" and Required growing from 526336 to 2504704 bytes against the same 232448-byte hardware limit ...]
-
-[triton-dejavu] First execution including JIT compilation took 5.715268850326538s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[... the remaining benchmark attempts likewise aborted with "bench_cudagraph failed with Triton Error [CUDA]: out of memory", each raising RuntimeError: Triton Error [CUDA]: out of memory from driver.active.utils.load_binary in compiler.py _init_handles ...]
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 263424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263424, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 279808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 279808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 279808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 279808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 362240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 362240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 378624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 378624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 378624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 378624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 329216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 361984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 361984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 361984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 361984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 428032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 526848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 526848, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 559616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 559616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 559616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 559616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 724480, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 724480, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 757248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 757248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 757248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 757248, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
[The same _do_bench_cudagraph stack trace is repeated after each of the following "bench_cudagraph failed with ..." lines and is elided here; the "Triton Error [CUDA]: out of memory" cases end in "RuntimeError: Triton Error [CUDA]: out of memory" raised from triton/compiler/compiler.py, line 408, in _init_handles (driver.active.utils.load_binary), and the shared-memory cases end in the corresponding OutOfResources message.]
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-bench_cudagraph failed with out of resource: shared memory, Required: 263168, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 328704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 328704, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 460800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 526336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 526336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 658432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 723968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 723968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 856064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 921600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 921600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1053696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1119232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1119232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1448960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1514496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1514496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 526336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 657408, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 657408, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 921600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1052672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1052672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1316864, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1447936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1447936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1712128, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1843200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1843200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 2107392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 2238464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 2238464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 2897920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 3028992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 3028992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-[triton-dejavu] added BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 16, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None for _chunk_scan_fwd_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-3a41493c29184793fa894c5d134a5c291430843f2ca1b798ab5c9e58228d1814/tune_features-3e88866b92d333f029bc0ae6410b8ce764620f4a7514b0062dd8c43c8e63e3e1/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default and key ('256', '64', '128', 'True', 'torch.float32', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.int32', 'torch.bfloat16', 'torch.bfloat16', 'torch.bfloat16')
-[2025-07-23 21:36:10] Triton autotuning for function _chunk_scan_fwd_kernel finished after 15278.82s; best config selected: BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 64, BLOCK_SIZE_K: 16, num_warps: 2, num_ctas: 1, num_stages: 4, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None with benchmark time 0.014237518422305584; evaluated 2625 configurations;
-[triton-dejavu] ('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16') not in cache, starting to tune...
-[triton-dejavu] [2025-07-23 21:36:10] Started benchmarking of 2625 configurations... (use_bo: False, run: 0)
-[triton-dejavu] First execution including JIT compilation took 0.19137167930603027s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[triton-dejavu] First execution including JIT compilation took 0.19248533248901367s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[triton-dejavu] First execution including JIT compilation took 0.18099021911621094s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.20834088325500488s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.1988391876220703s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.2113637924194336s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.216780424118042s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.20966219902038574s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.4921605587005615s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.2258141040802002s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.22273588180541992s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.21141862869262695s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.265488862991333s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.22527718544006348s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.1997981071472168s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.29380369186401367s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.26201629638671875s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.20731806755065918s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.2723116874694824s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.27080583572387695s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.23759222030639648s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.1978166103363037s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.18923354148864746s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.21349525451660156s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.2810091972351074s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.22581052780151367s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.35887718200683594s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.2530679702758789s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -[triton-dejavu] First execution including JIT compilation took 0.24747061729431152s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph
-    cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-[triton-dejavu] First execution including JIT compilation took 0.24676847457885742s.
-bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -[triton-dejavu] First execution including JIT compilation took 0.2369706630706787s. -bench_cudagraph failed with CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 351, in _do_bench_cudagraph - cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda") - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 246.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. 
Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.67 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 127.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 
591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call 
last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", 
line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent 
call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 
591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call 
last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", 
line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent 
call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
[... the identical "bench_cudagraph failed with Triton Error [CUDA]: out of memory" traceback repeats verbatim for the remaining benchmarked configurations ...]
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 
591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call 
last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", 
line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent 
call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 265216, Hardware limit: 232448. 
Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 280064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 280064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 280064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 280064, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with out of resource: shared memory, Required: 277504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 277504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.

[The same two failure modes recur across the remaining autotuner configurations in this log: repeated `RuntimeError: Triton Error [CUDA]: out of memory` tracebacks, and `triton.runtime.errors.OutOfResources` tracebacks reporting Required shared memory of 277504, 282624, 348160, 353280, 418816, 423936, 494592, 560128, 565248, 706560, and 989184 bytes against the 232448-byte hardware limit.]

-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 
591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call 
last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", 
line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent 
call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", 
line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent 
call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 243712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 
591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call 
last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", 
line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent 
call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 242688, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 242688, Hardware limit: 232448. 
Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 293888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 293888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 293888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 293888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 326656, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 326656, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 251904, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 335872, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 419840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 587776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 261632, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 261632, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 294400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 289792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 289792, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 364544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 364544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 373760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 373760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 373760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 373760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 439296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 439296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 523264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 523264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 523264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 523264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 588800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 588800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 598016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 598016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 598016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 598016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 598016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 598016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 747520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 747520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 747520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 747520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 747520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 747520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1046528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1046528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1046528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1046528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1046528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1046528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-               ^^^^^^^^^^
-  File
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 
591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call 
last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", 
line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent 
call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 342016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 342016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 342016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 342016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 
591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call 
last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", 
line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent 
call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 253952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 253952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 253952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 253952, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 256000, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 305152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 305152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 305152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 305152, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 358400, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 407552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 407552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 407552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 407552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
[... the identical "bench_cudagraph failed with Triton Error [CUDA]: out of memory" traceback repeats verbatim for each remaining autotuning configuration ...]
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py",
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 236544, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 236544, Hardware limit: 232448. 
Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 269312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 269312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 269312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 269312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory
-
-bench_cudagraph failed with out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 268288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
[The two failure signatures above repeat verbatim for the remaining autotuner configurations in this removed log: the shared-memory requirement rises through 268288, 270336, 335872, 337920, 403456, 473088, and 538624 bytes against the 232448-byte hardware limit, after which every subsequent bench_cudagraph attempt fails with "Triton Error [CUDA]: out of memory".]
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
[... the same two failure modes repeat verbatim for the remaining autotuner configurations: further "Triton Error [CUDA]: out of memory" failures, and shared-memory OutOfResources failures with Required values of 250880, 299008, 300032, 301056, 351232, 399360, 400384, 401408, 499712, 501760, 600064, 702464, and 800768 bytes against the 232448-byte hardware limit ...]
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 290304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 290304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 580608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 580608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 497664, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 661504, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 663552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 663552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 827392, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 829440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 829440, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1161216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1161216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1325056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1325056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1325056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1325056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-               ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
[... the same "bench_cudagraph failed with Triton Error [CUDA]: out of memory" traceback repeats verbatim for the remaining autotuner configurations ...]
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py",
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in 
run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 
232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 302080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 302080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 302080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 302080, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 303104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 376832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 376832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 376832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 376832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 378880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 452608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 452608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 452608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 452608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 604160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 604160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 604160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 604160, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
[... the same OutOfResources traceback repeats for further autotuner configurations, with shared-memory requirements between 249856 and 669696 bytes against the 232448-byte hardware limit ...]
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
[... this CUDA out-of-memory traceback likewise repeats for the remaining configurations ...]
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module,
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 
214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in 
_do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - 
^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", 
line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, 
kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 249856, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 250880, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 300032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 300032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 300032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 300032, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 351232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 351232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 400384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 400384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 400384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 400384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 301056, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 401408, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 401408, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 499712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 499712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 499712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 499712, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 501760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
-
-bench_cudagraph failed with out of resource: shared memory, Required: 232960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 232960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 265728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 265216, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 266240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 332800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 398336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 465920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 531456, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with out of resource: shared memory, Required: 264192, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 266240, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 399360, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 530432, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 532480, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 663552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 665600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 796672, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 931840, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-bench_cudagraph failed with out of resource: shared memory, Required: 1062912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1062912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 347648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 347648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 396800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 396800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396288, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 594944, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 695296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 695296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 793600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 793600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 793600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 793600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 395264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 397312, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 593920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 593920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 593920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 593920, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 794624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 794624, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 991232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 991232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 991232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 991232, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 993280, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1189888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1189888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1189888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1189888, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1390592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1390592, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1587200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1587200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1587200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1587200, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247296, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 282112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 282112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 282112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 282112, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
[... the identical "bench_cudagraph failed with Triton Error [CUDA]: out of memory" traceback repeats for further configurations ...]
-
-bench_cudagraph failed with out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 281600, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
[... the identical "OutOfResources: shared memory" traceback repeats with Required: 282624, 352256, 353280, 422912, 494592, 564224, and 280576 bytes against the same 232448-byte hardware limit, interleaved with further "Triton Error [CUDA]: out of memory" failures ...]
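Every OutOfResources entry above is the same failure mode: the autotuner tried a configuration whose tile sizes and `num_stages` need more shared memory than the 232448 bytes this GPU offers, so the kernel cannot even be loaded. Below is a minimal sketch of how such candidates could be filtered out before benchmarking; the shared-memory estimate assumes a hypothetical attention-style kernel (fp16 tiles of BLOCK_M x head_dim plus 2 x BLOCK_N x head_dim per pipeline stage), and the helper names are illustrative only, not part of this patch or of triton-dejavu.

import torch
import triton
from triton.runtime.driver import driver


def max_shared_mem_bytes() -> int:
    # Query the shared-memory limit from Triton's active driver; fall back to
    # the hardware limit reported in the log above if the query is unavailable
    # (the API layout differs slightly across Triton versions).
    try:
        props = driver.active.utils.get_device_properties(torch.cuda.current_device())
        return props["max_shared_mem"]
    except Exception:
        return 232448


def estimated_smem_bytes(cfg: triton.Config, head_dim: int = 128, elem_bytes: int = 2) -> int:
    # Illustrative estimate only: one BLOCK_M x head_dim tile plus two
    # BLOCK_N x head_dim tiles per pipeline stage, fp16 elements.
    # A real kernel needs its own formula.
    block_m = cfg.kwargs.get("BLOCK_M", 1)
    block_n = cfg.kwargs.get("BLOCK_N", 1)
    per_stage = (block_m * head_dim + 2 * block_n * head_dim) * elem_bytes
    return per_stage * max(cfg.num_stages, 1)


def prune_oversized(configs, nargs, **kwargs):
    # Drop configurations that cannot fit into shared memory; never return
    # an empty list so the autotuner always has at least one candidate.
    limit = max_shared_mem_bytes()
    kept = [c for c in configs if estimated_smem_bytes(c) <= limit]
    return kept or configs[:1]


if __name__ == "__main__":
    candidates = [
        triton.Config({"BLOCK_M": m, "BLOCK_N": n}, num_warps=8, num_stages=s)
        for m in (64, 128, 256)
        for n in (64, 128)
        for s in (2, 3, 4)
    ]
    survivors = prune_oversized(candidates, nargs={})
    print(f"{len(survivors)}/{len(candidates)} configs fit under {max_shared_mem_bytes()} bytes")

A hook like this could be wired into `triton.autotune(..., prune_configs_by={"early_config_prune": prune_oversized})` so that candidates which can only fail with the errors recorded here are skipped instead of benchmarked.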
-
[... the same "OutOfResources: shared memory" failure repeats with Required: 282624, 421888, 423936, 563200, 565248, 704512, 706560, 845824, 989184, and 1128448 bytes against the 232448-byte hardware limit ...]
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
[... identical "Triton Error [CUDA]: out of memory" traceback repeated for several more configurations ...]
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in 
__call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", 
line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 261632, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 261632, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 298496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 298496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 298496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 298496, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
- -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 372736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 372736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 372736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 372736, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 373760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 373760, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 447488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 447488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 447488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 447488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 523264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 523264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 596992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 596992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 596992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 596992, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 299008, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 446464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 446464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 446464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 446464, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 448512, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 595968, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 598016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 598016, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 745472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 745472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 745472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 745472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 747520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 747520, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 894976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 894976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 894976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 894976, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1046528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1046528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1193984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1193984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1193984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1193984, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-[... the identical "bench_cudagraph failed with Triton Error [CUDA]: out of memory" traceback repeats verbatim for many further autotuner configurations ...]
-
-bench_cudagraph failed with out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-[... identical OutOfResources traceback ...]
-
-bench_cudagraph failed with out of resource: shared memory, Required: 290304, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-[... identical OutOfResources traceback ...]
-
-bench_cudagraph failed with out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-[... identical OutOfResources traceback ...]
-
-bench_cudagraph failed with out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331264, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( 
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248832, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 330752, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 413696, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 414720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 496640, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 580608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 580608, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 662528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 329728, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 331776, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 495616, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
[... the same OutOfResources traceback repeats for each larger tile configuration; the reported shared-memory requirements grow from 497664 through 661504, 663552, 827392, 829440, 993280 and 1161216 to 1325056 bytes, all against the 232448-byte hardware limit ...]
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
[... this CUDA out-of-memory traceback and its "bench_cudagraph failed" summary then repeat verbatim for every remaining benchmarked configuration ...]
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ 
- File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - 
File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - 
^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - 
self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return 
jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in 
__getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247808, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 248320, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 297472, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 347648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 347648, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 396800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 396800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 396800, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error 
[CUDA]: out of memory
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
-
-bench_cudagraph failed with out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 296960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
[... the remaining deleted log lines repeat these same two bench_cudagraph failures, Triton Error [CUDA]: out of memory and OutOfResources for shared memory, with Required values ranging from 296960 to 1587200 bytes against the 232448-byte hardware limit ...]
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
- File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton 
Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - 
self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, 
in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File 
"/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, 
self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - 
^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in 
_init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = 
self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 
408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - 
ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with Triton Error [CUDA]: out of memory -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles - self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RuntimeError: Triton Error [CUDA]: out of memory - -bench_cudagraph failed with out of resource: shared memory, Required: 247552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 247552, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 263936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-
-bench_cudagraph failed with out of resource: shared memory, Required: 263936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles
-    raise OutOfResources(self.metadata.shared, max_shared, "shared memory")
-triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 263936, Hardware limit: 232448. Reducing block sizes or `num_stages` may help.
-
-bench_cudagraph failed with Triton Error [CUDA]: out of memory
-Traceback (most recent call last):
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph
-    fn()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__
-    return jit_first_time()
-           ^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time
-    ret = self.call_lambda()
-          ^^^^^^^^^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call
-    self.fn.run(
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run
-    kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
-    ^^^^^^^^^^
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__
-    self._init_handles()
-  File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 408, in _init_handles
-    self.module, self.function, self.n_regs, self.n_spills = driver.active.utils.load_binary(
-                                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-RuntimeError: Triton Error [CUDA]: out of memory
[... further near-identical bench_cudagraph failure tracebacks elided: repeated "Triton Error [CUDA]: out of memory" and "triton.runtime.errors.OutOfResources: out of resource: shared memory" errors, with Required values ranging from 262144 to 1055744 bytes against the 232448-byte hardware limit ...]
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 526336, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 528384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 528384, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 790528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 790528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 790528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 790528, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 792576, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1054720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1054720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1054720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1054720, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1056768, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1056768, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1318912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1318912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1318912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1318912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1320960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1320960, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1583104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1583104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1583104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1583104, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 1849344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 1849344, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2111488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
-Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2111488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. - -bench_cudagraph failed with out of resource: shared memory, Required: 2111488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. -Traceback (most recent call last): - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 342, in _do_bench_cudagraph - fn() - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 214, in __call__ - return jit_first_time() - ^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/testing.py", line 205, in jit_first_time - ret = self.call_lambda() - ^^^^^^^^^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/triton-dejavu/triton_dejavu/autotuner.py", line 421, in kernel_call - self.fn.run( - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/runtime/jit.py", line 591, in run - kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, - ^^^^^^^^^^ - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 413, in __getattribute__ - self._init_handles() - File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/triton/compiler/compiler.py", line 401, in _init_handles - raise OutOfResources(self.metadata.shared, max_shared, "shared memory") -triton.runtime.errors.OutOfResources: out of resource: shared memory, Required: 2111488, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
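All of these failures share one root cause: the candidate tile sizes and num_stages imply far more shared memory (roughly 256 KiB up to 2 MiB here) than the 232448 bytes available per SM on this H100. Rather than letting the autotuner benchmark and reject such candidates one by one, they can be pruned up front against the device limit. The sketch below is an illustration only, under a deliberately crude assumption that the kernel stages one (BLOCK_SIZE_M x BLOCK_SIZE_K) and one (BLOCK_SIZE_N x BLOCK_SIZE_K) tile per pipeline stage; that is not the exact footprint of _chunk_state_varlen_kernel.

import torch
import triton
from triton.runtime import driver


def device_shared_mem_limit() -> int:
    # Per-SM shared memory of the active GPU; 232448 bytes on the H100 in this log.
    props = driver.active.utils.get_device_properties(torch.cuda.current_device())
    return props["max_shared_mem"]


def prune_oversized(configs: list[triton.Config], elem_bytes: int = 2) -> list[triton.Config]:
    # Keep only configs whose rough shared-memory estimate fits the device.
    # elem_bytes=2 assumes fp16/bf16 tiles.
    limit = device_shared_mem_limit()
    kept = []
    for cfg in configs:
        m = cfg.kwargs["BLOCK_SIZE_M"]
        n = cfg.kwargs["BLOCK_SIZE_N"]
        k = cfg.kwargs["BLOCK_SIZE_K"]
        est = (m * k + n * k) * elem_bytes * max(cfg.num_stages, 1)
        if est <= limit:
            kept.append(cfg)
    return kept

Triton's @autotune decorator typically accepts this kind of filter through its prune_configs_by hook, so oversized candidates never reach bench_cudagraph at all.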
- -[triton-dejavu] added BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 16, BLOCK_SIZE_K: 16, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None for _chunk_state_varlen_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-f10105bbcf94b3788568aecfef8eb69570d7757afd57bef99faf7bf930a4edcf/tune_features-a17bcb1c348fee486b4e400e9ec475828d4f0d3118d72067b1bc6f94903360fa/kernel_configs-31086bbabdaa5bbed7ee80f8c2feb8195925fe0fe23a8fdfe525b114e663bdea/default and key ('64', '128', '256', 'torch.bfloat16', 'torch.bfloat16', 'torch.float32', 'torch.float32', 'torch.bfloat16', 'torch.int32', 'torch.bfloat16') -[2025-07-24 03:00:55] Triton autotuning for function _chunk_state_varlen_kernel finished after 19485.39s; best config selected: BLOCK_SIZE_M: 16, BLOCK_SIZE_N: 16, BLOCK_SIZE_K: 16, num_warps: 2, num_ctas: 1, num_stages: 1, num_buffers_warp_spec: 0, num_consumer_groups: 0, reg_dec_producer: 0, reg_inc_consumer: 0, maxnreg: None with benchmark time nan; evaluated 2625 configurations; -ERROR 07-24 03:00:55 [dump_input.py:69] Dumping input data for V1 LLM engine (v0.1.dev7919+g84c7525) with config: model='/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf', speculative_config=None, tokenizer='/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=132096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"/home/zrlngl/.cache/vllm/torch_compile_cache/9bcd1b9f98","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"max_capture_size":512,"local_cache_dir":"/home/zrlngl/.cache/vllm/torch_compile_cache/9bcd1b9f98/rank_0_0/backbone"}, -ERROR 07-24 03:00:55 [dump_input.py:76] Dumping scheduler output for model execution: 
SchedulerOutput(scheduled_new_reqs=[NewRequestData(req_id=0,prompt_token_ids_len=64,mm_inputs=[],mm_hashes=[],mm_positions=[],sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=True, max_tokens=1, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None, extra_args=None),block_ids=([1], [2], [3], [4], [5], [6], [7], [8], [9], [10]),num_computed_tokens=0,lora_request=None)], scheduled_cached_reqs=CachedRequestData(req_ids=[], resumed_from_preemption=[], new_token_ids=[], new_block_ids=[], num_computed_tokens=[]), num_scheduled_tokens={0: 64}, total_num_scheduled_tokens=64, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, num_common_prefix_blocks=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], finished_req_ids=[], free_encoder_input_ids=[], structured_output_request_ids={}, grammar_bitmask=null, kv_connector_metadata=null) -ERROR 07-24 03:00:55 [dump_input.py:79] Dumping scheduler stats: SchedulerStats(num_running_reqs=1, num_waiting_reqs=0, kv_cache_usage=0.009856630824372714, prefix_cache_stats=PrefixCacheStats(reset=False, requests=0, queries=0, hits=0), spec_decoding_stats=None, num_corrupted_reqs=0) -ERROR 07-24 03:00:55 [core.py:615] EngineCore encountered a fatal error. -ERROR 07-24 03:00:55 [core.py:615] Traceback (most recent call last): -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/engine/core.py", line 606, in run_engine_core -ERROR 07-24 03:00:55 [core.py:615] engine_core.run_busy_loop() -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/engine/core.py", line 633, in run_busy_loop -ERROR 07-24 03:00:55 [core.py:615] self._process_engine_step() -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/engine/core.py", line 658, in _process_engine_step -ERROR 07-24 03:00:55 [core.py:615] outputs, model_executed = self.step_fn() -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/engine/core.py", line 262, in step -ERROR 07-24 03:00:55 [core.py:615] model_output = self.execute_model(scheduler_output) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/engine/core.py", line 248, in execute_model -ERROR 07-24 03:00:55 [core.py:615] raise err -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/engine/core.py", line 239, in execute_model -ERROR 07-24 03:00:55 [core.py:615] return self.model_executor.execute_model(scheduler_output) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/executor/abstract.py", line 87, in execute_model -ERROR 07-24 03:00:55 [core.py:615] output = self.collective_rpc("execute_model", -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/executor/uniproc_executor.py", line 58, in collective_rpc -ERROR 07-24 03:00:55 [core.py:615] answer = 
run_method(self.driver_worker, method, args, kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/utils/__init__.py", line 2990, in run_method -ERROR 07-24 03:00:55 [core.py:615] return func(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context -ERROR 07-24 03:00:55 [core.py:615] return func(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/worker/gpu_worker.py", line 327, in execute_model -ERROR 07-24 03:00:55 [core.py:615] output = self.model_runner.execute_model(scheduler_output, -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context -ERROR 07-24 03:00:55 [core.py:615] return func(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/v1/worker/gpu_model_runner.py", line 1404, in execute_model -ERROR 07-24 03:00:55 [core.py:615] model_output = self.model( -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl -ERROR 07-24 03:00:55 [core.py:615] return self._call_impl(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl -ERROR 07-24 03:00:55 [core.py:615] return forward_call(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/model_executor/models/granitemoehybrid.py", line 634, in forward -ERROR 07-24 03:00:55 [core.py:615] hidden_states = self.model(input_ids, positions, mamba_cache_params, -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/compilation/decorators.py", line 246, in __call__ -ERROR 07-24 03:00:55 [core.py:615] model_output = self.forward(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/model_executor/models/granitemoehybrid.py", line 358, in forward -ERROR 07-24 03:00:55 [core.py:615] def forward( -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl -ERROR 07-24 03:00:55 [core.py:615] return self._call_impl(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File 
"/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl -ERROR 07-24 03:00:55 [core.py:615] return forward_call(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 838, in _fn -ERROR 07-24 03:00:55 [core.py:615] return fn(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/fx/graph_module.py", line 830, in call_wrapped -ERROR 07-24 03:00:55 [core.py:615] return self._wrapped_call(self, *args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/fx/graph_module.py", line 406, in __call__ -ERROR 07-24 03:00:55 [core.py:615] raise e -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/fx/graph_module.py", line 393, in __call__ -ERROR 07-24 03:00:55 [core.py:615] return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc] -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl -ERROR 07-24 03:00:55 [core.py:615] return self._call_impl(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl -ERROR 07-24 03:00:55 [core.py:615] return forward_call(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File ".82", line 220, in forward -ERROR 07-24 03:00:55 [core.py:615] submod_1 = self.submod_1(getitem, s0, getitem_1); getitem = submod_1 = None -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/fx/graph_module.py", line 830, in call_wrapped -ERROR 07-24 03:00:55 [core.py:615] return self._wrapped_call(self, *args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/fx/graph_module.py", line 406, in __call__ -ERROR 07-24 03:00:55 [core.py:615] raise e -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/fx/graph_module.py", line 393, in __call__ -ERROR 07-24 03:00:55 [core.py:615] return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc] -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl -ERROR 07-24 
03:00:55 [core.py:615] return self._call_impl(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl -ERROR 07-24 03:00:55 [core.py:615] return forward_call(*args, **kwargs) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File ".2", line 5, in forward -ERROR 07-24 03:00:55 [core.py:615] mamba_mixer2 = torch.ops.vllm.mamba_mixer2(x_3, output, 'model.layers.0.mixer', None); x_3 = output = mamba_mixer2 = None -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/venv_cuda/lib/python3.12/site-packages/torch/_ops.py", line 1158, in __call__ -ERROR 07-24 03:00:55 [core.py:615] return self._op(*args, **(kwargs or {})) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/model_executor/layers/mamba/mamba_mixer2.py", line 749, in mamba_mixer2 -ERROR 07-24 03:00:55 [core.py:615] self.forward_cuda(hidden_states=hidden_states, -ERROR 07-24 03:00:55 [core.py:615] File "/home/zrlngl/watsonx/vllm-triton-backend/vllm/vllm/model_executor/layers/mamba/mamba_mixer2.py", line 718, in forward_cuda -ERROR 07-24 03:00:55 [core.py:615] hidden_states = torch.vstack(ssd_output_list) -ERROR 07-24 03:00:55 [core.py:615] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ERROR 07-24 03:00:55 [core.py:615] torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 79.18 GiB of which 2.94 MiB is free. Process 1095261 has 586.00 MiB memory in use. Including non-PyTorch memory, this process has 78.59 GiB memory in use. Of the allocated memory 69.66 GiB is allocated by PyTorch, with 88.00 MiB allocated in private pools (e.g., CUDA Graphs), and 130.95 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[(1, 64, 1), (1, 128, 1), (1, 512, 1), (1, 1024, 1), (1, 2048, 1), (1, 4096, 1)] -====== Measuring batch_size 1, input length 64, output length 1 ===== -VLLM_USE_V1=1 python vllm-triton-backend/vllm/benchmarks/benchmark_latency.py --model /net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf --input-len 64 --output-len 1 --batch-size 1 --output-json /home/zrlngl/watsonx/zrl-triton-results-and-notebooks/vllm_benchmarks_latency/-net-storage149-autofs-css22-nmg-models-cos-1bfc857-fmaas-integration-tests-models-granite-4_0-small-base-pipecleaner-hf/NVIDIA_H100_80GB_HBM3/tuning_ignore/exp_2025-07-23_1140//result_bs_1_il_64_ol_1.json --num-iters-warmup 3 --num-iters 3 --tensor-parallel 1 -benchmark command returned 256, stopping... 
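The sweep ends with a hard failure: after 2625 evaluated configurations the engine itself runs out of GPU memory in forward_cuda (the torch.vstack over ssd_output_list) with only 2.94 MiB free, and the benchmark driver stops once the command exits non-zero. The OOM message itself suggests trying the expandable-segments allocator to reduce fragmentation. A minimal, unverified way to follow that hint is to export PYTORCH_CUDA_ALLOC_CONF before re-launching the same benchmark command, as sketched below (the --output-json argument from the original invocation is omitted, and whether this rescues this particular run is not established by the log).

import os
import subprocess

env = dict(
    os.environ,
    VLLM_USE_V1="1",
    # Allocator hint taken from the OOM message above; it must be in place
    # before the benchmark process initializes CUDA.
    PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True",
)

cmd = [
    "python", "vllm-triton-backend/vllm/benchmarks/benchmark_latency.py",
    "--model", "/net/storage149/autofs/css22/nmg/models/cos/1bfc857/"
               "fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf",
    "--input-len", "64", "--output-len", "1", "--batch-size", "1",
    "--num-iters-warmup", "3", "--num-iters", "3", "--tensor-parallel", "1",
]
# check=True raises CalledProcessError if the benchmark fails again,
# matching the "benchmark command returned 256, stopping..." behaviour above.
subprocess.run(cmd, env=env, check=True)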
diff --git a/vllm b/vllm
index 8ba5e3324..f0c503f66 160000
--- a/vllm
+++ b/vllm
@@ -1 +1 @@
-Subproject commit 8ba5e3324c93ea4c2b791676baa93838dbe0ca9e
+Subproject commit f0c503f66e2f6aafa966318d488fd92ac662cdf0

From f3f623e361c15875635e5f704a31a3313f5c1420 Mon Sep 17 00:00:00 2001
From: Burkhard Ringlein
Date: Wed, 3 Sep 2025 08:08:16 -0400
Subject: [PATCH 61/61] further cleanup

Signed-off-by: Burkhard Ringlein
---
 .../default/cache.json | 8 -
 .../default/cache.json | 8 -
 .../default/cache.json | 8 -
 .../default/cache.json | 8 -
 .../ibm_triton_lib/kernels/tmp_triton_attn.py | 486 ------------------
 scripts/high_qps_bench.sh | 26 -
 scripts/quantize_g4.py | 42 --
 scripts/quantize_g4_2.py | 108 ----
 8 files changed, 694 deletions(-)
 delete mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-21ff5d19d1819793851ad7c7a60e8f4d7bd7bc84238d0302676bb9e213122e34/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json
 delete mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json
 delete mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json
 delete mode 100755 g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default/cache.json
 delete mode 100644 ibm-triton-lib/ibm_triton_lib/kernels/tmp_triton_attn.py
 delete mode 100755 scripts/high_qps_bench.sh
 delete mode 100644 scripts/quantize_g4.py
 delete mode 100644 scripts/quantize_g4_2.py

diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-21ff5d19d1819793851ad7c7a60e8f4d7bd7bc84238d0302676bb9e213122e34/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json
b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-21ff5d19d1819793851ad7c7a60e8f4d7bd7bc84238d0302676bb9e213122e34/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json deleted file mode 100755 index 550944b2a..000000000 --- a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-21ff5d19d1819793851ad7c7a60e8f4d7bd7bc84238d0302676bb9e213122e34/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.mamba_ssm:_selective_scan_update_kernel)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json deleted file mode 100755 index 550944b2a..000000000 --- a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-4452dd34c8d5c1eade558a6589c89cd1205e0da4d4ef8a72ee7c4c702061e9ba/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.mamba_ssm:_selective_scan_update_kernel)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json 
b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json deleted file mode 100755 index 550944b2a..000000000 --- a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_selective_scan_update_kernel/autotune_config-bef61f0485b4347899c813bd65c9c1d763e62f3d6b5fda018baf600097187c0a/code_version-6e39dd3d45fb273ddd153a523337e2ddf49f78a75c71658d852d1a7dc1326857/tune_features-93313ae47bf85925b0b3b8a0af710ff4a94421cf3e6ebd1a348e74369ddc45e8/kernel_configs-85691372c5ea21c12337d65667ec842af16b51057ec486e7af706471f7a50309/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.mamba_ssm:_selective_scan_update_kernel)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default/cache.json b/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default/cache.json deleted file mode 100755 index a62237df1..000000000 --- a/g4_tuning_data/dejavu_0.7/triton_3.3.1/cuda_12.4/gpu_NVIDIA_H100_80GB_HBM3/_state_passing_fwd_kernel/autotune_config-215d0c7082adf7c6c8ae2a767088f42b44e6432715b0c6760f5f8e5d4e8371ff/code_version-55db57c88b8fd2c2a9e9560aeb5afd5b585cf3507fa5eed7a0909f4d26b7cd86/tune_features-c5d4b45934fe1d9c636d8b0b8f49b5a26c5fc7064fb2bda916fe2743b77fcdc1/kernel_configs-c4fc6831bf929bccf1df2dabf2b7a316d7b0f7d0a3da7ec749b2f343f3ffe760/default/cache.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "signature": "JITFunction(vllm.model_executor.layers.mamba.ops.ssd_state_passing:_state_passing_fwd_kernel)", - "total_bench_time_s": 0.0, - "evaluated_configs": 0, - "keys": null, - "cache": {}, - "timings": {} -} \ No newline at end of file diff --git a/ibm-triton-lib/ibm_triton_lib/kernels/tmp_triton_attn.py b/ibm-triton-lib/ibm_triton_lib/kernels/tmp_triton_attn.py deleted file mode 100644 index ba0242a30..000000000 --- a/ibm-triton-lib/ibm_triton_lib/kernels/tmp_triton_attn.py +++ /dev/null @@ -1,486 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Attention layer with PagedAttention and Triton prefix prefill.""" -from dataclasses import dataclass -from typing import ClassVar, Optional - -import torch - -from vllm import _custom_ops as ops -from vllm import envs -from vllm.attention.backends.abstract import (AttentionBackend, 
AttentionImpl, - AttentionMetadata, AttentionType) -from vllm.attention.ops.chunked_prefill_paged_decode import ( - chunked_prefill_paged_decode) -from vllm.attention.ops.paged_attn import PagedAttention -from vllm.attention.ops.triton_unified_attention import unified_attention -from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.platforms import current_platform -from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata -from vllm.v1.attention.backends.utils import ( - AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, - reorder_batch_to_split_decodes_and_prefills) -from vllm.v1.core.sched.output import SchedulerOutput -from vllm.v1.kv_cache_interface import AttentionSpec -from vllm.v1.worker.gpu_input_batch import InputBatch - -logger = init_logger(__name__) - - -@dataclass -class TritonAttentionMetadata: - # NOTE(sang): Definition of context_len, query_len, and seq_len. - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ---------------------| - # |-- query_len ---| - - num_actual_tokens: int # Number of tokens excluding padding. - max_query_len: int - query_start_loc: torch.Tensor - num_decodes: int - max_seq_len: int - seq_lens: torch.Tensor - block_table: torch.Tensor - slot_mapping: torch.Tensor - use_split_kv: bool - segm_output: torch.Tensor - segm_max: torch.Tensor - segm_expsum: torch.Tensor - BLOCK_M_PREFILL: int - BLOCK_Q_PREFILL: int - BLOCK_M_DECODE: int - BLOCK_Q_DECODE: int - num_q_blocks: int - block_q_seq_boundaries: torch.Tensor - - # For cascade attention. - use_cascade: bool - common_prefix_len: int - cu_prefix_query_lens: Optional[torch.Tensor] - prefix_kv_lens: Optional[torch.Tensor] - suffix_kv_lens: Optional[torch.Tensor] - - # Optional aot scheduling - scheduler_metadata: Optional[torch.Tensor] = None - prefix_scheduler_metadata: Optional[torch.Tensor] = None - - -class TritonAttentionMetadataBuilder( - AttentionMetadataBuilder[TritonAttentionMetadata]): - attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ - AttentionCGSupport.ALWAYS - - def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], - vllm_config: VllmConfig, device: torch.device): - self.device = device - self.block_size = kv_cache_spec.block_size - self.kv_cache_spec = kv_cache_spec - - model_config = vllm_config.model_config - self.num_heads_q = model_config.get_num_attention_heads( - vllm_config.parallel_config) - self.num_heads_kv = model_config.get_num_kv_heads( - vllm_config.parallel_config) - self.headdim = model_config.get_head_size() - - def reorder_batch(self, input_batch: InputBatch, - scheduler_output: SchedulerOutput) -> bool: - return reorder_batch_to_split_decodes_and_prefills(input_batch, - scheduler_output, - decode_threshold=1) - - def build_for_cudagraph_capture( - self, common_attn_metadata: CommonAttentionMetadata - ) -> TritonAttentionMetadata: - attn_metadata = self.build(0, common_attn_metadata) - # When doing full graph capture, setting seq_lens to - # max_model_len will cause graph capture to be extremely - # slow, so here we set it to 1. 
- attn_metadata.seq_lens.fill_(1) - return attn_metadata - - def build(self, - common_prefix_len: int, - common_attn_metadata: CommonAttentionMetadata, - fast_build: bool = False) -> TritonAttentionMetadata: - num_actual_tokens = common_attn_metadata.num_actual_tokens - max_query_len = common_attn_metadata.max_query_len - - max_seq_len = int(common_attn_metadata.seq_lens_cpu.max()) - query_start_loc = common_attn_metadata.query_start_loc - seq_lens = common_attn_metadata.seq_lens - - query_lens = torch.diff(query_start_loc) - if max_query_len == 1: - num_decodes = len(seq_lens) - else: - num_decodes = torch.argmax((query_lens != 1).int()).item() - - BLOCK_M_PREFILL = 64 - BLOCK_M_DECODE = 16 - BLOCK_Q_PREFILL = BLOCK_M_PREFILL * self.num_heads_kv // self.num_heads_q - BLOCK_Q_DECODE = BLOCK_M_DECODE * self.num_heads_kv // self.num_heads_q - - block_q_seq_boundaries = torch.cumsum(torch.cat([torch.tensor([0], dtype=query_lens.dtype, device=query_lens.device), torch.ceil(query_lens[num_decodes:] / BLOCK_Q_PREFILL).to(torch.int)]), dim=0) - num_q_blocks = block_q_seq_boundaries[-1].item() - - block_table_tensor = common_attn_metadata.block_table_tensor - slot_mapping = common_attn_metadata.slot_mapping - - use_split_kv = (num_q_blocks * self.num_heads_kv < 128) - - NUM_SEGMENTS=16 - - if use_split_kv: - segm_output = torch.empty( - num_decodes, - self.num_heads_q, - NUM_SEGMENTS, - self.headdim, #triton.next_power_of_2(head_size), - dtype=torch.float32, - device=seq_lens.device, - ) - segm_max = torch.empty( - num_decodes, - self.num_heads_q, - NUM_SEGMENTS, - dtype=torch.float32, - device=seq_lens.device, - ) - segm_expsum = torch.empty( - num_decodes, - self.num_heads_q, - NUM_SEGMENTS, - dtype=torch.float32, - device=seq_lens.device, - ) - else: - segm_output = None - segm_max = None - segm_expsum = None - - use_cascade = common_prefix_len > 0 - - if use_cascade: - cu_prefix_query_lens = torch.tensor([0, num_actual_tokens], - dtype=torch.int32, - device=self.device) - prefix_kv_lens = torch.tensor([common_prefix_len], - dtype=torch.int32, - device=self.device) - suffix_kv_lens = (common_attn_metadata.seq_lens_cpu - - common_prefix_len) - suffix_kv_lens = suffix_kv_lens.to(self.device) - else: - cu_prefix_query_lens = None - prefix_kv_lens = None - suffix_kv_lens = None - prefix_scheduler_metadata = None - - attn_metadata = TritonAttentionMetadata( - num_actual_tokens=num_actual_tokens, - max_query_len=max_query_len, - query_start_loc=query_start_loc, - num_decodes=num_decodes, - max_seq_len=max_seq_len, - seq_lens=seq_lens, - block_table=block_table_tensor, - slot_mapping=slot_mapping, - use_cascade=use_cascade, - common_prefix_len=common_prefix_len, - cu_prefix_query_lens=cu_prefix_query_lens, - prefix_kv_lens=prefix_kv_lens, - suffix_kv_lens=suffix_kv_lens, - prefix_scheduler_metadata=prefix_scheduler_metadata, - use_split_kv=use_split_kv, - segm_output=segm_output, - segm_max=segm_max, - segm_expsum=segm_expsum, - BLOCK_M_PREFILL=BLOCK_M_PREFILL, - BLOCK_Q_PREFILL=BLOCK_Q_PREFILL, - BLOCK_M_DECODE=BLOCK_M_DECODE, - BLOCK_Q_DECODE=BLOCK_Q_DECODE, - num_q_blocks=num_q_blocks, - block_q_seq_boundaries=block_q_seq_boundaries, - ) - return attn_metadata - - def can_run_in_cudagraph( - self, common_attn_metadata: CommonAttentionMetadata) -> bool: - # Full CUDA Graph always supported - return True - - -class TritonAttentionBackend(AttentionBackend): - - accept_output_buffer: bool = True - - @classmethod - def get_supported_dtypes(cls) -> list[torch.dtype]: - return [torch.float16, 
torch.bfloat16] - - @classmethod - def get_supported_head_sizes(cls) -> list[int]: - return [32, 64, 96, 128, 160, 192, 224, 256] - - @classmethod - def validate_head_size(cls, head_size: int) -> None: - supported_head_sizes = cls.get_supported_head_sizes() - if head_size not in supported_head_sizes: - attn_type = cls.__name__.removesuffix("Backend") - raise ValueError( - f"Head size {head_size} is not supported by {attn_type}. " - f"Supported head sizes are: {supported_head_sizes}. " - "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use " - "FlexAttention backend which supports all head sizes.") - - @staticmethod - def get_name() -> str: - return "TRITON_ATTN_VLLM_V1" - - @staticmethod - def get_impl_cls() -> type["TritonAttentionImpl"]: - return TritonAttentionImpl - - @staticmethod - def get_metadata_cls() -> type["AttentionMetadata"]: - return TritonAttentionMetadata - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> tuple[int, ...]: - if block_size % 16 != 0: - raise ValueError("Block size must be a multiple of 16.") - return (num_blocks, 2, block_size, num_kv_heads, head_size) - - @staticmethod - def use_cascade_attention(*args, **kwargs) -> bool: - return False - - @staticmethod - def get_builder_cls() -> type["TritonAttentionMetadataBuilder"]: - return TritonAttentionMetadataBuilder - - -class TritonAttentionImpl(AttentionImpl): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[list[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float] = None, - attn_type: AttentionType = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[int] = None, - ) -> None: - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.alibi_slopes = alibi_slopes - if sliding_window is None: - self.sliding_window = (-1, -1) - else: - self.sliding_window = (sliding_window - 1, 0) - self.kv_cache_dtype = kv_cache_dtype - if logits_soft_cap is None: - # In flash-attn, setting logits_soft_cap as 0 means no soft cap. - logits_soft_cap = 0 - self.logits_soft_cap = logits_soft_cap - self.kv_sharing_target_layer_name = kv_sharing_target_layer_name - - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - - TritonAttentionBackend.validate_head_size(head_size) - - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "TritonAttentionImpl") - - self.fp8_dtype = current_platform.fp8_dtype() - self.force_prefill_decode_attn = \ - envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION - - def forward( - self, - layer: torch.nn.Module, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: FlashAttentionMetadata, - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with FlashAttention. - - Args: - query: shape = [num_tokens, num_heads, head_size] - key: shape = [num_tokens, num_kv_heads, head_size] - value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] - attn_metadata: Metadata for attention. 
- Returns: - shape = [num_tokens, num_heads * head_size] - """ - assert output is not None, "Output tensor must be provided." - - if output_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for TritonAttentionImpl") - - if attn_metadata is None: - # Profiling run. - return output - - assert attn_metadata.use_cascade is False - - # IMPORTANT! - # NOTE(woosuk): With piece-wise CUDA graphs, this method is executed in - # eager-mode PyTorch. Thus, we need to be careful about any CPU overhead - # in this method. For example, `view` and `slice` (or `[:n]`) operations - # are surprisingly slow even in the case they do not invoke any GPU ops. - # Minimize the PyTorch ops in this method as much as possible. - # Whenever making a change in this method, please benchmark the - # performance to make sure it does not introduce any overhead. - - use_prefill_decode_attn = self.force_prefill_decode_attn - num_actual_tokens = attn_metadata.num_actual_tokens - - if use_prefill_decode_attn: - key_cache, value_cache = PagedAttention.split_kv_cache( - kv_cache, self.num_kv_heads, self.head_size) - else: - key_cache, value_cache = kv_cache.unbind(1) - - if self.kv_sharing_target_layer_name is None: - # Reshape the input keys and values and store them in the cache. - # Skip this if sharing KV cache with an earlier attention layer. - if use_prefill_decode_attn: - PagedAttention.write_to_paged_cache( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) - else: - torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) - - if self.kv_cache_dtype.startswith("fp8"): - key_cache = key_cache.view(self.fp8_dtype) - value_cache = value_cache.view(self.fp8_dtype) - num_tokens, num_heads, head_size = query.shape - assert layer._q_scale == 1.0, \ - "A non 1.0 q_scale is not currently supported." - if not current_platform.is_rocm(): - # Skip Q quantization on ROCm, since dequantizing back to - # f32 in the attention kernel is not supported. - query, _ = ops.scaled_fp8_quant( - query.reshape( - (num_tokens, num_heads * head_size)).contiguous(), - layer._q_scale) - query = query.reshape((num_tokens, num_heads, head_size)) - - cu_seqlens_q = attn_metadata.query_start_loc - num_decodes = attn_metadata.num_decodes - seqused_k = attn_metadata.seq_lens - max_seqlen_q = attn_metadata.max_query_len - max_seqlen_k = attn_metadata.max_seq_len - block_table = attn_metadata.block_table - - use_split_kv = attn_metadata.use_split_kv - segm_output = attn_metadata.segm_output - segm_max = attn_metadata.segm_max - segm_expsum = attn_metadata.segm_expsum - - BLOCK_M_PREFILL = attn_metadata.BLOCK_M_PREFILL - BLOCK_Q_PREFILL = attn_metadata.BLOCK_Q_PREFILL - BLOCK_M_DECODE = attn_metadata.BLOCK_M_DECODE - BLOCK_Q_DECODE = attn_metadata.BLOCK_Q_DECODE - num_q_blocks = attn_metadata.num_q_blocks - block_q_seq_boundaries = attn_metadata.block_q_seq_boundaries - - if use_prefill_decode_attn: - # Compute attention and update output up to `num_actual_tokens`. 
- chunked_prefill_paged_decode(query=query[:num_actual_tokens], - key=key[:num_actual_tokens], - value=value[:num_actual_tokens], - output=output[:num_actual_tokens], - kv_cache_dtype=self.kv_cache_dtype, - key_cache=key_cache, - value_cache=value_cache, - block_table=block_table, - query_start_loc=cu_seqlens_q, - seq_lens=seqused_k, - max_seq_len=max_seqlen_k, - max_query_len=max_seqlen_q, - k_scale=layer._k_scale, - v_scale=layer._v_scale, - alibi_slopes=self.alibi_slopes, - sliding_window=self.sliding_window[0], - sm_scale=self.scale) - else: - descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1]) - - unified_attention( - q=query[:num_actual_tokens], - k=key_cache, - v=value_cache, - out=output[:num_actual_tokens], - cu_seqlens_q=cu_seqlens_q, - max_seqlen_q=max_seqlen_q, - num_decodes=num_decodes, - seqused_k=seqused_k, - max_seqlen_k=max_seqlen_k, - softmax_scale=self.scale, - causal=True, - alibi_slopes=self.alibi_slopes, - window_size=self.sliding_window, - block_table=block_table, - softcap=self.logits_soft_cap, - q_descale=None, # Not supported - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - use_split_kv=use_split_kv, - segm_output=segm_output, - segm_max=segm_max, - segm_expsum=segm_expsum, - BLOCK_M_PREFILL=BLOCK_M_PREFILL, - BLOCK_Q_PREFILL=BLOCK_Q_PREFILL, - BLOCK_M_DECODE=BLOCK_M_DECODE, - BLOCK_Q_DECODE=BLOCK_Q_DECODE, - num_q_blocks=num_q_blocks, - block_q_seq_boundaries=block_q_seq_boundaries - ) - - return output diff --git a/scripts/high_qps_bench.sh b/scripts/high_qps_bench.sh deleted file mode 100755 index 4812d8b5a..000000000 --- a/scripts/high_qps_bench.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -# uv pip install pandas datasets numpy - -# MODEL=meta-llama/Llama-3.1-8B-Instruct -# MODEL=/net/storage149/autofs/css22/nmg/models/hf/ibm-granite/granite-4.0-tiny-preview/main/ -MODEL=/net/storage149/autofs/css22/nmg/models/cos/1bfc857/fmaas-integration-tests/models/granite-4_0-small-base-pipecleaner-hf -# MODEL=/net/storage149/autofs/css22/nmg/models/hf/ibm-ai-platform/Bamba-9B-v1/main/ -REQUEST_RATES=(20 20 20) -TOTAL_SECONDS=120 - -for REQUEST_RATE in "${REQUEST_RATES[@]}"; -do - NUM_PROMPTS=$(($TOTAL_SECONDS * $REQUEST_RATE)) - echo "" - echo "===== RUNNING $MODEL FOR $NUM_PROMPTS PROMPTS WITH $REQUEST_RATE QPS =====" - echo "" - python3 vllm-triton-backend/vllm/benchmarks/benchmark_serving.py \ - --model $MODEL \ - --dataset-name random \ - --ignore-eos \ - --num-prompts $NUM_PROMPTS \ - --request-rate $REQUEST_RATE \ - --port 8803 \ - ; -done; diff --git a/scripts/quantize_g4.py b/scripts/quantize_g4.py deleted file mode 100644 index 7d19f6e20..000000000 --- a/scripts/quantize_g4.py +++ /dev/null @@ -1,42 +0,0 @@ -import sys -import os - -from transformers import AutoTokenizer, AutoModelForCausalLM -from transformers import GraniteMoeHybridForCausalLM -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier - -# MODEL_ID = "ibm-granite/granite-4.0-tiny-preview" -model_path = sys.argv[1] -store_path = sys.argv[2] -print(f"Quantizing {model_path} using FP8_DYNAMIC...") - -model = AutoModelForCausalLM.from_pretrained( - model_path, device_map="auto", torch_dtype="auto", -) -# model = GraniteMoeHybridForCausalLM.from_pretrained( -# model_path, device_map="auto", torch_dtype="auto", -# ) -# print(model) -tokenizer = AutoTokenizer.from_pretrained(model_path) - -# Configure the simple PTQ quantization -recipe = QuantizationModifier( - targets="Linear", 
scheme="FP8_DYNAMIC", - ignore=[ - "re:.*lm_head", - # "re:.*block_sparse_moe", - "re:.*block_sparse_moe.router", - ] - ) - -# Apply the quantization algorithm. -oneshot(model=model, recipe=recipe) -#, output_dir=store_path) - -print(f"...done. Saving to {store_path}...") -# # SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -model.save_pretrained(store_path, save_compressed=True) -tokenizer.save_pretrained(store_path) - -print("...done.") diff --git a/scripts/quantize_g4_2.py b/scripts/quantize_g4_2.py deleted file mode 100644 index 2afb038a9..000000000 --- a/scripts/quantize_g4_2.py +++ /dev/null @@ -1,108 +0,0 @@ -import sys -import os - -from transformers import AutoTokenizer, AutoModelForCausalLM -from llmcompressor import oneshot -from llmcompressor.modifiers.quantization import QuantizationModifier - -# MODEL_ID = "ibm-granite/granite-4.0-tiny-preview" -model_path = sys.argv[1] -store_path = sys.argv[2] -print(f"Quantizing {model_path} using FP8...") - - -from datasets import load_dataset -from packaging.version import Version -from transformers import AutoModelForCausalLM, AutoTokenizer, __version__ - -from llmcompressor.utils import dispatch_for_generation - -# NOTE: transformers 4.49.0 has an attribute error with DeepSeek. -# Please consider either downgrading your transformers version to a -# previous version or upgrading to a version where this bug is fixed - -# select a Mixture of Experts model for quantization -MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" - -model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, torch_dtype="auto", trust_remote_code=True -) -tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -# Select calibration dataset. -# its recommended to use more calibration samples for MoE models so each expert is hit -DATASET_ID = "HuggingFaceH4/ultrachat_200k" -DATASET_SPLIT = "train_sft" -NUM_CALIBRATION_SAMPLES = 2048 -MAX_SEQUENCE_LENGTH = 2048 - - -# Load dataset and preprocess. -ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]") -ds = ds.shuffle(seed=42) - - -def preprocess(example): - return { - "text": tokenizer.apply_chat_template( - example["messages"], - tokenize=False, - ) - } - - -ds = ds.map(preprocess) - - -# Tokenize inputs. -def tokenize(sample): - return tokenizer( - sample["text"], - padding=False, - max_length=MAX_SEQUENCE_LENGTH, - truncation=True, - add_special_tokens=False, - ) - - -ds = ds.map(tokenize, remove_columns=ds.column_names) - - - -model = AutoModelForCausalLM.from_pretrained( - model_path, device_map="auto", torch_dtype="auto", -) -tokenizer = AutoTokenizer.from_pretrained(model_path) - -# Configure the simple PTQ quantization -recipe = QuantizationModifier( - targets="Linear", scheme="FP8", - ignore=[ - "re:.*lm_head", - "re:.*self_attn", - "re:.*router", - # "re:.*block_sparse_moe.gate", - "re:.*moe*", - ] - ) - -# Apply the quantization algorithm. -# oneshot(model=model, recipe=recipe) - -oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - trust_remote_code_model=True, -) - - -print(f"...done. Saving to {store_path}...") -# Save the model: granite-4.0-tiny-preview-FP8-Dynamic -# SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" -model.save_pretrained(store_path) -tokenizer.save_pretrained(store_path) - -print("...done.")